diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,154022 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.999366420274551, + "global_step": 11830, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25816988945007324, + "epoch": 0.0, + "learning_rate": 4.999577345731192e-05, + "loss": 0.2805, + "step": 1, + "task_loss": 0.5853821039199829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18578635156154633, + "epoch": 0.0, + "learning_rate": 4.9991546914623835e-05, + "loss": 0.1863, + "step": 2, + "task_loss": 0.16780924797058105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2388908416032791, + "epoch": 0.0, + "learning_rate": 4.998732037193576e-05, + "loss": 0.4048, + "step": 3, + "task_loss": 0.38245245814323425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2596370279788971, + "epoch": 0.0, + "learning_rate": 4.998309382924768e-05, + "loss": 0.3622, + "step": 4, + "task_loss": 0.6480276584625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4107702672481537, + "epoch": 0.0, + "learning_rate": 4.9978867286559594e-05, + "loss": 0.3972, + "step": 5, + "task_loss": 0.26988765597343445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1514049470424652, + "epoch": 0.01, + "learning_rate": 4.9974640743871514e-05, + "loss": 0.3625, + "step": 6, + "task_loss": 0.46446821093559265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28078120946884155, + "epoch": 0.01, + "learning_rate": 4.9970414201183434e-05, + "loss": 0.3568, + "step": 7, + "task_loss": 0.429648220539093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27217891812324524, + "epoch": 0.01, + "learning_rate": 4.9966187658495354e-05, + "loss": 0.4557, + "step": 8, + "task_loss": 0.17257165908813477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.270322322845459, + "epoch": 0.01, + "learning_rate": 4.996196111580727e-05, + "loss": 0.466, + "step": 9, + "task_loss": 0.3322319984436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4226338565349579, + "epoch": 0.01, + "learning_rate": 4.995773457311919e-05, + "loss": 0.2858, + "step": 10, + "task_loss": 0.35513943433761597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17420129477977753, + "epoch": 0.01, + "learning_rate": 4.995350803043111e-05, + "loss": 0.3424, + "step": 11, + "task_loss": 0.36419224739074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33628472685813904, + "epoch": 0.01, + "learning_rate": 4.9949281487743026e-05, + "loss": 0.3603, + "step": 12, + "task_loss": 0.4097657799720764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42833825945854187, + "epoch": 0.01, + "learning_rate": 4.9945054945054945e-05, + "loss": 0.3342, + "step": 13, + "task_loss": 0.6553051471710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3272498846054077, + "epoch": 0.01, + "learning_rate": 4.9940828402366865e-05, + "loss": 0.29, + "step": 14, + "task_loss": 0.41925597190856934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31984272599220276, + "epoch": 0.01, + "learning_rate": 4.9936601859678785e-05, + "loss": 0.4187, + "step": 15, + "task_loss": 0.4311681389808655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2317107617855072, + "epoch": 0.01, + "learning_rate": 4.9932375316990705e-05, + "loss": 0.2976, + "step": 16, + "task_loss": 0.7428198456764221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43023306131362915, + "epoch": 0.01, + "learning_rate": 4.9928148774302624e-05, + "loss": 0.3232, + "step": 17, + "task_loss": 1.6245992183685303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4810776710510254, + "epoch": 0.02, + "learning_rate": 4.992392223161454e-05, + "loss": 0.5375, + "step": 18, + "task_loss": 0.6471912860870361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2258974015712738, + "epoch": 0.02, + "learning_rate": 4.991969568892646e-05, + "loss": 0.2664, + "step": 19, + "task_loss": 1.0436300039291382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4231599271297455, + "epoch": 0.02, + "learning_rate": 4.9915469146238384e-05, + "loss": 0.5146, + "step": 20, + "task_loss": 0.9995765089988708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19174601137638092, + "epoch": 0.02, + "learning_rate": 4.9911242603550297e-05, + "loss": 0.3358, + "step": 21, + "task_loss": 0.8044640421867371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44361889362335205, + "epoch": 0.02, + "learning_rate": 4.9907016060862216e-05, + "loss": 0.3343, + "step": 22, + "task_loss": 0.6296488642692566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16035886108875275, + "epoch": 0.02, + "learning_rate": 4.9902789518174136e-05, + "loss": 0.3306, + "step": 23, + "task_loss": 0.7133399844169617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31523415446281433, + "epoch": 0.02, + "learning_rate": 4.9898562975486056e-05, + "loss": 0.2981, + "step": 24, + "task_loss": 0.15186113119125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3411262035369873, + "epoch": 0.02, + "learning_rate": 4.9894336432797976e-05, + "loss": 0.4549, + "step": 25, + "task_loss": 1.002905011177063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30748438835144043, + "epoch": 0.02, + "learning_rate": 4.9890109890109895e-05, + "loss": 0.5096, + "step": 26, + "task_loss": 0.8128278851509094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3724653124809265, + "epoch": 0.02, + "learning_rate": 4.9885883347421815e-05, + "loss": 0.3626, + "step": 27, + "task_loss": 1.2241909503936768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1662927269935608, + "epoch": 0.02, + "learning_rate": 4.988165680473373e-05, + "loss": 0.2403, + "step": 28, + "task_loss": 0.2341037094593048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32311949133872986, + "epoch": 0.02, + "learning_rate": 4.987743026204565e-05, + "loss": 0.3691, + "step": 29, + "task_loss": 1.0425922870635986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4696677029132843, + "epoch": 0.03, + "learning_rate": 4.987320371935757e-05, + "loss": 0.3536, + "step": 30, + "task_loss": 0.5542616844177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42097368836402893, + "epoch": 0.03, + "learning_rate": 4.986897717666949e-05, + "loss": 0.3675, + "step": 31, + "task_loss": 0.59581458568573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24710862338542938, + "epoch": 0.03, + "learning_rate": 4.986475063398141e-05, + "loss": 0.296, + "step": 32, + "task_loss": 0.6669486165046692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16573220491409302, + "epoch": 0.03, + "learning_rate": 4.986052409129333e-05, + "loss": 0.4524, + "step": 33, + "task_loss": 0.8582977652549744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6471076011657715, + "epoch": 0.03, + "learning_rate": 4.985629754860524e-05, + "loss": 0.4255, + "step": 34, + "task_loss": 0.6770200729370117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46286115050315857, + "epoch": 0.03, + "learning_rate": 4.985207100591716e-05, + "loss": 0.4156, + "step": 35, + "task_loss": 0.5535679459571838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21832047402858734, + "epoch": 0.03, + "learning_rate": 4.984784446322908e-05, + "loss": 0.3233, + "step": 36, + "task_loss": 0.27768674492836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2571682929992676, + "epoch": 0.03, + "learning_rate": 4.9843617920541006e-05, + "loss": 0.3658, + "step": 37, + "task_loss": 0.7903686761856079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4108579158782959, + "epoch": 0.03, + "learning_rate": 4.983939137785292e-05, + "loss": 0.4139, + "step": 38, + "task_loss": 0.3737262785434723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2741393744945526, + "epoch": 0.03, + "learning_rate": 4.983516483516484e-05, + "loss": 0.3012, + "step": 39, + "task_loss": 0.5845351815223694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21137279272079468, + "epoch": 0.03, + "learning_rate": 4.983093829247676e-05, + "loss": 0.3481, + "step": 40, + "task_loss": 1.2179186344146729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47420579195022583, + "epoch": 0.03, + "learning_rate": 4.982671174978867e-05, + "loss": 0.3995, + "step": 41, + "task_loss": 0.5109229683876038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19544543325901031, + "epoch": 0.04, + "learning_rate": 4.98224852071006e-05, + "loss": 0.3171, + "step": 42, + "task_loss": 0.26069581508636475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30108460783958435, + "epoch": 0.04, + "learning_rate": 4.981825866441252e-05, + "loss": 0.4314, + "step": 43, + "task_loss": 1.0975393056869507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21767646074295044, + "epoch": 0.04, + "learning_rate": 4.981403212172443e-05, + "loss": 0.3726, + "step": 44, + "task_loss": 0.4263548254966736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5715851187705994, + "epoch": 0.04, + "learning_rate": 4.980980557903635e-05, + "loss": 0.4301, + "step": 45, + "task_loss": 0.7781269550323486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4327733814716339, + "epoch": 0.04, + "learning_rate": 4.980557903634827e-05, + "loss": 0.4044, + "step": 46, + "task_loss": 0.6987329721450806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2926572561264038, + "epoch": 0.04, + "learning_rate": 4.980135249366019e-05, + "loss": 0.4134, + "step": 47, + "task_loss": 0.9054892659187317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2079082876443863, + "epoch": 0.04, + "learning_rate": 4.979712595097211e-05, + "loss": 0.2983, + "step": 48, + "task_loss": 0.1822921335697174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3556250035762787, + "epoch": 0.04, + "learning_rate": 4.979289940828403e-05, + "loss": 0.333, + "step": 49, + "task_loss": 0.21551582217216492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2854697108268738, + "epoch": 0.04, + "learning_rate": 4.978867286559594e-05, + "loss": 0.2963, + "step": 50, + "task_loss": 0.47764793038368225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3259928226470947, + "epoch": 0.04, + "learning_rate": 4.978444632290786e-05, + "loss": 0.3551, + "step": 51, + "task_loss": 0.4995303153991699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22598080337047577, + "epoch": 0.04, + "learning_rate": 4.978021978021978e-05, + "loss": 0.3064, + "step": 52, + "task_loss": 0.28154799342155457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25202736258506775, + "epoch": 0.04, + "learning_rate": 4.97759932375317e-05, + "loss": 0.3011, + "step": 53, + "task_loss": 0.23623007535934448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2742934226989746, + "epoch": 0.05, + "learning_rate": 4.977176669484362e-05, + "loss": 0.333, + "step": 54, + "task_loss": 0.49772605299949646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20836201310157776, + "epoch": 0.05, + "learning_rate": 4.976754015215554e-05, + "loss": 0.3544, + "step": 55, + "task_loss": 1.1939759254455566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2060907930135727, + "epoch": 0.05, + "learning_rate": 4.976331360946746e-05, + "loss": 0.3227, + "step": 56, + "task_loss": 0.744167685508728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1929110884666443, + "epoch": 0.05, + "learning_rate": 4.975908706677937e-05, + "loss": 0.3204, + "step": 57, + "task_loss": 0.5006382465362549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44293028116226196, + "epoch": 0.05, + "learning_rate": 4.975486052409129e-05, + "loss": 0.419, + "step": 58, + "task_loss": 0.31968680024147034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21261683106422424, + "epoch": 0.05, + "learning_rate": 4.975063398140322e-05, + "loss": 0.2863, + "step": 59, + "task_loss": 0.16719569265842438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33205848932266235, + "epoch": 0.05, + "learning_rate": 4.974640743871513e-05, + "loss": 0.3029, + "step": 60, + "task_loss": 0.36968085169792175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48490577936172485, + "epoch": 0.05, + "learning_rate": 4.974218089602705e-05, + "loss": 0.4028, + "step": 61, + "task_loss": 0.7566177845001221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22474291920661926, + "epoch": 0.05, + "learning_rate": 4.973795435333897e-05, + "loss": 0.3067, + "step": 62, + "task_loss": 1.0528796911239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3649987578392029, + "epoch": 0.05, + "learning_rate": 4.9733727810650885e-05, + "loss": 0.3056, + "step": 63, + "task_loss": 0.6386075615882874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5773598551750183, + "epoch": 0.05, + "learning_rate": 4.9729501267962805e-05, + "loss": 0.4557, + "step": 64, + "task_loss": 0.6312403678894043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11294424533843994, + "epoch": 0.05, + "learning_rate": 4.972527472527473e-05, + "loss": 0.2713, + "step": 65, + "task_loss": 0.34500980377197266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34409666061401367, + "epoch": 0.06, + "learning_rate": 4.972104818258665e-05, + "loss": 0.3823, + "step": 66, + "task_loss": 0.9285640716552734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4020559787750244, + "epoch": 0.06, + "learning_rate": 4.9716821639898564e-05, + "loss": 0.3368, + "step": 67, + "task_loss": 1.0970407724380493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47366437315940857, + "epoch": 0.06, + "learning_rate": 4.9712595097210484e-05, + "loss": 0.3769, + "step": 68, + "task_loss": 0.21794253587722778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4191763699054718, + "epoch": 0.06, + "learning_rate": 4.97083685545224e-05, + "loss": 0.3188, + "step": 69, + "task_loss": 0.7065517902374268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5290296077728271, + "epoch": 0.06, + "learning_rate": 4.970414201183432e-05, + "loss": 0.463, + "step": 70, + "task_loss": 0.6956005096435547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.419831246137619, + "epoch": 0.06, + "learning_rate": 4.969991546914624e-05, + "loss": 0.4419, + "step": 71, + "task_loss": 1.0496934652328491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2980637848377228, + "epoch": 0.06, + "learning_rate": 4.969568892645816e-05, + "loss": 0.387, + "step": 72, + "task_loss": 0.4555947184562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28176164627075195, + "epoch": 0.06, + "learning_rate": 4.9691462383770076e-05, + "loss": 0.3574, + "step": 73, + "task_loss": 0.5821244120597839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5036299228668213, + "epoch": 0.06, + "learning_rate": 4.9687235841081995e-05, + "loss": 0.4155, + "step": 74, + "task_loss": 0.7105315327644348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7065041065216064, + "epoch": 0.06, + "learning_rate": 4.9683009298393915e-05, + "loss": 0.4075, + "step": 75, + "task_loss": 1.288073182106018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22645768523216248, + "epoch": 0.06, + "learning_rate": 4.9678782755705835e-05, + "loss": 0.4971, + "step": 76, + "task_loss": 0.45604434609413147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22015847265720367, + "epoch": 0.07, + "learning_rate": 4.9674556213017755e-05, + "loss": 0.2785, + "step": 77, + "task_loss": 0.14051614701747894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.335431843996048, + "epoch": 0.07, + "learning_rate": 4.9670329670329674e-05, + "loss": 0.3127, + "step": 78, + "task_loss": 0.6418079137802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2796747386455536, + "epoch": 0.07, + "learning_rate": 4.966610312764159e-05, + "loss": 0.3599, + "step": 79, + "task_loss": 0.3722460865974426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5145119428634644, + "epoch": 0.07, + "learning_rate": 4.966187658495351e-05, + "loss": 0.3894, + "step": 80, + "task_loss": 0.5264815092086792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3600746989250183, + "epoch": 0.07, + "learning_rate": 4.965765004226543e-05, + "loss": 0.3373, + "step": 81, + "task_loss": 1.6320440769195557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28211796283721924, + "epoch": 0.07, + "learning_rate": 4.965342349957735e-05, + "loss": 0.2809, + "step": 82, + "task_loss": 0.6417269706726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3130403459072113, + "epoch": 0.07, + "learning_rate": 4.9649196956889266e-05, + "loss": 0.2982, + "step": 83, + "task_loss": 0.25361379981040955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3775560259819031, + "epoch": 0.07, + "learning_rate": 4.9644970414201186e-05, + "loss": 0.3535, + "step": 84, + "task_loss": 0.48491528630256653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28904205560684204, + "epoch": 0.07, + "learning_rate": 4.9640743871513106e-05, + "loss": 0.3024, + "step": 85, + "task_loss": 1.2618898153305054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2344900667667389, + "epoch": 0.07, + "learning_rate": 4.963651732882502e-05, + "loss": 0.471, + "step": 86, + "task_loss": 0.2024116814136505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.457430899143219, + "epoch": 0.07, + "learning_rate": 4.9632290786136945e-05, + "loss": 0.4479, + "step": 87, + "task_loss": 0.9474542737007141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27561503648757935, + "epoch": 0.07, + "learning_rate": 4.9628064243448865e-05, + "loss": 0.2618, + "step": 88, + "task_loss": 0.6162384748458862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21126046776771545, + "epoch": 0.08, + "learning_rate": 4.962383770076078e-05, + "loss": 0.2443, + "step": 89, + "task_loss": 0.1814127415418625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35313093662261963, + "epoch": 0.08, + "learning_rate": 4.96196111580727e-05, + "loss": 0.3678, + "step": 90, + "task_loss": 0.453416109085083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4890415072441101, + "epoch": 0.08, + "learning_rate": 4.961538461538462e-05, + "loss": 0.3357, + "step": 91, + "task_loss": 0.49295541644096375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7142332792282104, + "epoch": 0.08, + "learning_rate": 4.961115807269654e-05, + "loss": 0.4798, + "step": 92, + "task_loss": 0.6236546635627747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3460538387298584, + "epoch": 0.08, + "learning_rate": 4.960693153000846e-05, + "loss": 0.5107, + "step": 93, + "task_loss": 1.159478783607483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27619296312332153, + "epoch": 0.08, + "learning_rate": 4.9602704987320377e-05, + "loss": 0.3218, + "step": 94, + "task_loss": 0.35865139961242676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6670293211936951, + "epoch": 0.08, + "learning_rate": 4.9598478444632296e-05, + "loss": 0.5086, + "step": 95, + "task_loss": 0.8902304172515869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35298633575439453, + "epoch": 0.08, + "learning_rate": 4.959425190194421e-05, + "loss": 0.4552, + "step": 96, + "task_loss": 1.6388323307037354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3052392303943634, + "epoch": 0.08, + "learning_rate": 4.959002535925613e-05, + "loss": 0.2989, + "step": 97, + "task_loss": 0.635475754737854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.359167218208313, + "epoch": 0.08, + "learning_rate": 4.958579881656805e-05, + "loss": 0.3379, + "step": 98, + "task_loss": 0.845808207988739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1947636753320694, + "epoch": 0.08, + "learning_rate": 4.958157227387997e-05, + "loss": 0.2466, + "step": 99, + "task_loss": 0.40310221910476685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5416989326477051, + "epoch": 0.08, + "learning_rate": 4.957734573119189e-05, + "loss": 0.4943, + "step": 100, + "task_loss": 1.3766098022460938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34332895278930664, + "epoch": 0.09, + "learning_rate": 4.957311918850381e-05, + "loss": 0.3289, + "step": 101, + "task_loss": 0.3026656210422516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28686532378196716, + "epoch": 0.09, + "learning_rate": 4.956889264581572e-05, + "loss": 0.3582, + "step": 102, + "task_loss": 1.122120976448059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4475458860397339, + "epoch": 0.09, + "learning_rate": 4.956466610312764e-05, + "loss": 0.4738, + "step": 103, + "task_loss": 0.8384937047958374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49370914697647095, + "epoch": 0.09, + "learning_rate": 4.956043956043957e-05, + "loss": 0.4248, + "step": 104, + "task_loss": 1.3479255437850952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4307611584663391, + "epoch": 0.09, + "learning_rate": 4.955621301775148e-05, + "loss": 0.3323, + "step": 105, + "task_loss": 0.9533501863479614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1903613805770874, + "epoch": 0.09, + "learning_rate": 4.95519864750634e-05, + "loss": 0.3374, + "step": 106, + "task_loss": 0.5240545868873596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2635105848312378, + "epoch": 0.09, + "learning_rate": 4.954775993237532e-05, + "loss": 0.3491, + "step": 107, + "task_loss": 1.090175747871399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25856882333755493, + "epoch": 0.09, + "learning_rate": 4.954353338968723e-05, + "loss": 0.3718, + "step": 108, + "task_loss": 0.12843522429466248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5180971026420593, + "epoch": 0.09, + "learning_rate": 4.953930684699916e-05, + "loss": 0.3546, + "step": 109, + "task_loss": 0.4853808581829071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2508522570133209, + "epoch": 0.09, + "learning_rate": 4.953508030431108e-05, + "loss": 0.4383, + "step": 110, + "task_loss": 0.2109997421503067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32342344522476196, + "epoch": 0.09, + "learning_rate": 4.9530853761623e-05, + "loss": 0.4014, + "step": 111, + "task_loss": 0.9165175557136536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2745503783226013, + "epoch": 0.09, + "learning_rate": 4.952662721893491e-05, + "loss": 0.3822, + "step": 112, + "task_loss": 0.040358565747737885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31443509459495544, + "epoch": 0.1, + "learning_rate": 4.952240067624683e-05, + "loss": 0.3267, + "step": 113, + "task_loss": 0.7784364223480225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20579645037651062, + "epoch": 0.1, + "learning_rate": 4.951817413355875e-05, + "loss": 0.411, + "step": 114, + "task_loss": 0.3764180541038513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4143642485141754, + "epoch": 0.1, + "learning_rate": 4.951394759087067e-05, + "loss": 0.3356, + "step": 115, + "task_loss": 0.4431930482387543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26486262679100037, + "epoch": 0.1, + "learning_rate": 4.950972104818259e-05, + "loss": 0.3657, + "step": 116, + "task_loss": 0.45691344141960144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5901132822036743, + "epoch": 0.1, + "learning_rate": 4.950549450549451e-05, + "loss": 0.4279, + "step": 117, + "task_loss": 0.6737844944000244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6378105878829956, + "epoch": 0.1, + "learning_rate": 4.950126796280642e-05, + "loss": 0.4166, + "step": 118, + "task_loss": 0.7184115648269653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24172967672348022, + "epoch": 0.1, + "learning_rate": 4.949704142011834e-05, + "loss": 0.4309, + "step": 119, + "task_loss": 0.2585623562335968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4076937437057495, + "epoch": 0.1, + "learning_rate": 4.949281487743026e-05, + "loss": 0.5081, + "step": 120, + "task_loss": 1.1405022144317627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5878621339797974, + "epoch": 0.1, + "learning_rate": 4.948858833474218e-05, + "loss": 0.4726, + "step": 121, + "task_loss": 0.25736790895462036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16709516942501068, + "epoch": 0.1, + "learning_rate": 4.94843617920541e-05, + "loss": 0.2761, + "step": 122, + "task_loss": 0.5165044665336609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22008250653743744, + "epoch": 0.1, + "learning_rate": 4.948013524936602e-05, + "loss": 0.2773, + "step": 123, + "task_loss": 0.8879871964454651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4000396728515625, + "epoch": 0.1, + "learning_rate": 4.947590870667794e-05, + "loss": 0.375, + "step": 124, + "task_loss": 0.7097489237785339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8170579671859741, + "epoch": 0.11, + "learning_rate": 4.9471682163989855e-05, + "loss": 0.4451, + "step": 125, + "task_loss": 0.4997040331363678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2630847692489624, + "epoch": 0.11, + "learning_rate": 4.946745562130178e-05, + "loss": 0.2828, + "step": 126, + "task_loss": 1.1715372800827026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3628414273262024, + "epoch": 0.11, + "learning_rate": 4.94632290786137e-05, + "loss": 0.3252, + "step": 127, + "task_loss": 0.3841911554336548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26722654700279236, + "epoch": 0.11, + "learning_rate": 4.9459002535925614e-05, + "loss": 0.4783, + "step": 128, + "task_loss": 2.1524083614349365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44716575741767883, + "epoch": 0.11, + "learning_rate": 4.9454775993237533e-05, + "loss": 0.4692, + "step": 129, + "task_loss": 1.471771240234375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6284704804420471, + "epoch": 0.11, + "learning_rate": 4.945054945054945e-05, + "loss": 0.3347, + "step": 130, + "task_loss": 0.25305724143981934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2816174030303955, + "epoch": 0.11, + "learning_rate": 4.944632290786137e-05, + "loss": 0.336, + "step": 131, + "task_loss": 1.7037070989608765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18777093291282654, + "epoch": 0.11, + "learning_rate": 4.944209636517329e-05, + "loss": 0.3004, + "step": 132, + "task_loss": 0.07924327254295349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49237024784088135, + "epoch": 0.11, + "learning_rate": 4.943786982248521e-05, + "loss": 0.4681, + "step": 133, + "task_loss": 1.0658358335494995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40326786041259766, + "epoch": 0.11, + "learning_rate": 4.9433643279797125e-05, + "loss": 0.4265, + "step": 134, + "task_loss": 1.5436773300170898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13374197483062744, + "epoch": 0.11, + "learning_rate": 4.9429416737109045e-05, + "loss": 0.2439, + "step": 135, + "task_loss": 0.40028491616249084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4446583688259125, + "epoch": 0.11, + "learning_rate": 4.9425190194420965e-05, + "loss": 0.3879, + "step": 136, + "task_loss": 0.6489905714988708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2626723647117615, + "epoch": 0.12, + "learning_rate": 4.9420963651732885e-05, + "loss": 0.3771, + "step": 137, + "task_loss": 0.15167920291423798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.255208283662796, + "epoch": 0.12, + "learning_rate": 4.9416737109044804e-05, + "loss": 0.3497, + "step": 138, + "task_loss": 0.6664825081825256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25952035188674927, + "epoch": 0.12, + "learning_rate": 4.9412510566356724e-05, + "loss": 0.2659, + "step": 139, + "task_loss": 0.13492964208126068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3707650303840637, + "epoch": 0.12, + "learning_rate": 4.9408284023668644e-05, + "loss": 0.2525, + "step": 140, + "task_loss": 0.7116761803627014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21200241148471832, + "epoch": 0.12, + "learning_rate": 4.940405748098056e-05, + "loss": 0.3401, + "step": 141, + "task_loss": 0.7733478546142578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48759669065475464, + "epoch": 0.12, + "learning_rate": 4.9399830938292477e-05, + "loss": 0.3436, + "step": 142, + "task_loss": 1.0769908428192139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38488003611564636, + "epoch": 0.12, + "learning_rate": 4.93956043956044e-05, + "loss": 0.3408, + "step": 143, + "task_loss": 1.7284129858016968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2676427662372589, + "epoch": 0.12, + "learning_rate": 4.9391377852916316e-05, + "loss": 0.3083, + "step": 144, + "task_loss": 1.0192649364471436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21843518316745758, + "epoch": 0.12, + "learning_rate": 4.9387151310228236e-05, + "loss": 0.2872, + "step": 145, + "task_loss": 0.4018738865852356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6630820035934448, + "epoch": 0.12, + "learning_rate": 4.9382924767540155e-05, + "loss": 0.3742, + "step": 146, + "task_loss": 1.3242998123168945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28332826495170593, + "epoch": 0.12, + "learning_rate": 4.937869822485207e-05, + "loss": 0.4332, + "step": 147, + "task_loss": 0.5753346681594849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25695425271987915, + "epoch": 0.13, + "learning_rate": 4.9374471682163995e-05, + "loss": 0.315, + "step": 148, + "task_loss": 0.9013621211051941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25189918279647827, + "epoch": 0.13, + "learning_rate": 4.9370245139475915e-05, + "loss": 0.4817, + "step": 149, + "task_loss": 0.228720560669899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4760388135910034, + "epoch": 0.13, + "learning_rate": 4.936601859678783e-05, + "loss": 0.4217, + "step": 150, + "task_loss": 0.6111045479774475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4705445170402527, + "epoch": 0.13, + "learning_rate": 4.936179205409975e-05, + "loss": 0.5399, + "step": 151, + "task_loss": 1.0512099266052246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29746076464653015, + "epoch": 0.13, + "learning_rate": 4.935756551141167e-05, + "loss": 0.4229, + "step": 152, + "task_loss": 0.2905904948711395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3166247010231018, + "epoch": 0.13, + "learning_rate": 4.935333896872359e-05, + "loss": 0.4384, + "step": 153, + "task_loss": 0.6401548981666565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5391810536384583, + "epoch": 0.13, + "learning_rate": 4.934911242603551e-05, + "loss": 0.4406, + "step": 154, + "task_loss": 0.442170649766922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3679506778717041, + "epoch": 0.13, + "learning_rate": 4.9344885883347426e-05, + "loss": 0.4341, + "step": 155, + "task_loss": 0.7660558223724365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44703105092048645, + "epoch": 0.13, + "learning_rate": 4.9340659340659346e-05, + "loss": 0.3725, + "step": 156, + "task_loss": 0.31833499670028687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3736255466938019, + "epoch": 0.13, + "learning_rate": 4.933643279797126e-05, + "loss": 0.3678, + "step": 157, + "task_loss": 0.9408439993858337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5729640126228333, + "epoch": 0.13, + "learning_rate": 4.933220625528318e-05, + "loss": 0.5446, + "step": 158, + "task_loss": 0.3189557194709778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23309840261936188, + "epoch": 0.13, + "learning_rate": 4.93279797125951e-05, + "loss": 0.4131, + "step": 159, + "task_loss": 0.08921865373849869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24005305767059326, + "epoch": 0.14, + "learning_rate": 4.932375316990702e-05, + "loss": 0.2883, + "step": 160, + "task_loss": 0.696487307548523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.548248827457428, + "epoch": 0.14, + "learning_rate": 4.931952662721894e-05, + "loss": 0.3492, + "step": 161, + "task_loss": 0.8208967447280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.09402120858430862, + "epoch": 0.14, + "learning_rate": 4.931530008453086e-05, + "loss": 0.2299, + "step": 162, + "task_loss": 0.01031099446117878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2072228193283081, + "epoch": 0.14, + "learning_rate": 4.931107354184277e-05, + "loss": 0.2668, + "step": 163, + "task_loss": 0.5715734958648682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4195344150066376, + "epoch": 0.14, + "learning_rate": 4.930684699915469e-05, + "loss": 0.3986, + "step": 164, + "task_loss": 0.8805137872695923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4367620348930359, + "epoch": 0.14, + "learning_rate": 4.930262045646662e-05, + "loss": 0.3233, + "step": 165, + "task_loss": 1.0261000394821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4603279232978821, + "epoch": 0.14, + "learning_rate": 4.929839391377853e-05, + "loss": 0.4211, + "step": 166, + "task_loss": 0.8634194731712341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4213913381099701, + "epoch": 0.14, + "learning_rate": 4.929416737109045e-05, + "loss": 0.373, + "step": 167, + "task_loss": 1.4583817720413208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20449984073638916, + "epoch": 0.14, + "learning_rate": 4.928994082840237e-05, + "loss": 0.4052, + "step": 168, + "task_loss": 0.14410758018493652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23656447231769562, + "epoch": 0.14, + "learning_rate": 4.928571428571429e-05, + "loss": 0.2618, + "step": 169, + "task_loss": 0.05356917530298233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.244521826505661, + "epoch": 0.14, + "learning_rate": 4.928148774302621e-05, + "loss": 0.2705, + "step": 170, + "task_loss": 0.2933933436870575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30323511362075806, + "epoch": 0.14, + "learning_rate": 4.927726120033813e-05, + "loss": 0.2563, + "step": 171, + "task_loss": 0.49771153926849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30565589666366577, + "epoch": 0.15, + "learning_rate": 4.927303465765005e-05, + "loss": 0.2989, + "step": 172, + "task_loss": 0.282042920589447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36005306243896484, + "epoch": 0.15, + "learning_rate": 4.926880811496196e-05, + "loss": 0.3026, + "step": 173, + "task_loss": 0.36329495906829834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30820009112358093, + "epoch": 0.15, + "learning_rate": 4.926458157227388e-05, + "loss": 0.3365, + "step": 174, + "task_loss": 1.2730638980865479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3730316460132599, + "epoch": 0.15, + "learning_rate": 4.92603550295858e-05, + "loss": 0.3356, + "step": 175, + "task_loss": 0.489788681268692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2243185043334961, + "epoch": 0.15, + "learning_rate": 4.925612848689772e-05, + "loss": 0.3381, + "step": 176, + "task_loss": 0.4047553837299347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3008669316768646, + "epoch": 0.15, + "learning_rate": 4.925190194420964e-05, + "loss": 0.2945, + "step": 177, + "task_loss": 0.466147780418396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19667010009288788, + "epoch": 0.15, + "learning_rate": 4.924767540152156e-05, + "loss": 0.2742, + "step": 178, + "task_loss": 0.5776610970497131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21384766697883606, + "epoch": 0.15, + "learning_rate": 4.924344885883347e-05, + "loss": 0.375, + "step": 179, + "task_loss": 0.3693738579750061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2983366847038269, + "epoch": 0.15, + "learning_rate": 4.923922231614539e-05, + "loss": 0.3567, + "step": 180, + "task_loss": 0.609382152557373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30045539140701294, + "epoch": 0.15, + "learning_rate": 4.923499577345731e-05, + "loss": 0.3851, + "step": 181, + "task_loss": 0.38646233081817627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2023361772298813, + "epoch": 0.15, + "learning_rate": 4.923076923076924e-05, + "loss": 0.2859, + "step": 182, + "task_loss": 0.8480668067932129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24268339574337006, + "epoch": 0.15, + "learning_rate": 4.922654268808115e-05, + "loss": 0.3398, + "step": 183, + "task_loss": 0.6510738134384155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23291978240013123, + "epoch": 0.16, + "learning_rate": 4.922231614539307e-05, + "loss": 0.3482, + "step": 184, + "task_loss": 1.3659625053405762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3567195236682892, + "epoch": 0.16, + "learning_rate": 4.921808960270499e-05, + "loss": 0.4689, + "step": 185, + "task_loss": 0.8014994859695435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.268853098154068, + "epoch": 0.16, + "learning_rate": 4.9213863060016904e-05, + "loss": 0.3964, + "step": 186, + "task_loss": 0.9685837030410767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.10869912803173065, + "epoch": 0.16, + "learning_rate": 4.920963651732883e-05, + "loss": 0.2751, + "step": 187, + "task_loss": 0.37290409207344055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25897130370140076, + "epoch": 0.16, + "learning_rate": 4.920540997464075e-05, + "loss": 0.3332, + "step": 188, + "task_loss": 0.43131357431411743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35233813524246216, + "epoch": 0.16, + "learning_rate": 4.9201183431952664e-05, + "loss": 0.3468, + "step": 189, + "task_loss": 1.5360071659088135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18634657561779022, + "epoch": 0.16, + "learning_rate": 4.919695688926458e-05, + "loss": 0.2161, + "step": 190, + "task_loss": 0.5493410229682922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12940114736557007, + "epoch": 0.16, + "learning_rate": 4.91927303465765e-05, + "loss": 0.2754, + "step": 191, + "task_loss": 0.028904523700475693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3526163101196289, + "epoch": 0.16, + "learning_rate": 4.9188503803888416e-05, + "loss": 0.3042, + "step": 192, + "task_loss": 0.7353550791740417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35715556144714355, + "epoch": 0.16, + "learning_rate": 4.918427726120034e-05, + "loss": 0.3503, + "step": 193, + "task_loss": 0.39157605171203613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38395220041275024, + "epoch": 0.16, + "learning_rate": 4.918005071851226e-05, + "loss": 0.3159, + "step": 194, + "task_loss": 0.5014969110488892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4627022445201874, + "epoch": 0.16, + "learning_rate": 4.9175824175824175e-05, + "loss": 0.3327, + "step": 195, + "task_loss": 0.5290047526359558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20345164835453033, + "epoch": 0.17, + "learning_rate": 4.9171597633136095e-05, + "loss": 0.2885, + "step": 196, + "task_loss": 0.8092126250267029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6022716760635376, + "epoch": 0.17, + "learning_rate": 4.9167371090448015e-05, + "loss": 0.3501, + "step": 197, + "task_loss": 0.6652913689613342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4178740382194519, + "epoch": 0.17, + "learning_rate": 4.9163144547759934e-05, + "loss": 0.4446, + "step": 198, + "task_loss": 0.5839725732803345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24084614217281342, + "epoch": 0.17, + "learning_rate": 4.9158918005071854e-05, + "loss": 0.3208, + "step": 199, + "task_loss": 0.6247647404670715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5380869507789612, + "epoch": 0.17, + "learning_rate": 4.9154691462383774e-05, + "loss": 0.4439, + "step": 200, + "task_loss": 1.4319199323654175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28592145442962646, + "epoch": 0.17, + "learning_rate": 4.9150464919695694e-05, + "loss": 0.2889, + "step": 201, + "task_loss": 0.6194091439247131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26645004749298096, + "epoch": 0.17, + "learning_rate": 4.914623837700761e-05, + "loss": 0.307, + "step": 202, + "task_loss": 0.6286120414733887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16263580322265625, + "epoch": 0.17, + "learning_rate": 4.9142011834319526e-05, + "loss": 0.2924, + "step": 203, + "task_loss": 0.24746073782444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21327966451644897, + "epoch": 0.17, + "learning_rate": 4.913778529163145e-05, + "loss": 0.3467, + "step": 204, + "task_loss": 0.4032347500324249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5899863243103027, + "epoch": 0.17, + "learning_rate": 4.9133558748943366e-05, + "loss": 0.3852, + "step": 205, + "task_loss": 0.2813647389411926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15700814127922058, + "epoch": 0.17, + "learning_rate": 4.9129332206255286e-05, + "loss": 0.356, + "step": 206, + "task_loss": 0.19302156567573547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3096359670162201, + "epoch": 0.17, + "learning_rate": 4.9125105663567205e-05, + "loss": 0.356, + "step": 207, + "task_loss": 0.8887370824813843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19024589657783508, + "epoch": 0.18, + "learning_rate": 4.912087912087912e-05, + "loss": 0.2885, + "step": 208, + "task_loss": 0.7476159930229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6751109957695007, + "epoch": 0.18, + "learning_rate": 4.911665257819104e-05, + "loss": 0.3892, + "step": 209, + "task_loss": 0.6961216926574707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22509407997131348, + "epoch": 0.18, + "learning_rate": 4.9112426035502965e-05, + "loss": 0.2569, + "step": 210, + "task_loss": 0.46005940437316895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20882007479667664, + "epoch": 0.18, + "learning_rate": 4.9108199492814884e-05, + "loss": 0.3174, + "step": 211, + "task_loss": 0.35927194356918335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23338523507118225, + "epoch": 0.18, + "learning_rate": 4.91039729501268e-05, + "loss": 0.2839, + "step": 212, + "task_loss": 0.5691384077072144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18294541537761688, + "epoch": 0.18, + "learning_rate": 4.909974640743872e-05, + "loss": 0.2884, + "step": 213, + "task_loss": 0.08627941459417343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24400398135185242, + "epoch": 0.18, + "learning_rate": 4.909551986475064e-05, + "loss": 0.3955, + "step": 214, + "task_loss": 1.446928858757019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2911604046821594, + "epoch": 0.18, + "learning_rate": 4.9091293322062556e-05, + "loss": 0.3205, + "step": 215, + "task_loss": 0.5143615007400513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28537502884864807, + "epoch": 0.18, + "learning_rate": 4.9087066779374476e-05, + "loss": 0.3005, + "step": 216, + "task_loss": 1.0182530879974365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5088628530502319, + "epoch": 0.18, + "learning_rate": 4.9082840236686396e-05, + "loss": 0.4478, + "step": 217, + "task_loss": 1.3089011907577515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38104909658432007, + "epoch": 0.18, + "learning_rate": 4.907861369399831e-05, + "loss": 0.3935, + "step": 218, + "task_loss": 0.4928361177444458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3787882328033447, + "epoch": 0.19, + "learning_rate": 4.907438715131023e-05, + "loss": 0.3754, + "step": 219, + "task_loss": 0.650814950466156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2550523579120636, + "epoch": 0.19, + "learning_rate": 4.907016060862215e-05, + "loss": 0.3334, + "step": 220, + "task_loss": 0.5834181904792786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2224728763103485, + "epoch": 0.19, + "learning_rate": 4.906593406593407e-05, + "loss": 0.2642, + "step": 221, + "task_loss": 0.3143998086452484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23832443356513977, + "epoch": 0.19, + "learning_rate": 4.906170752324599e-05, + "loss": 0.322, + "step": 222, + "task_loss": 1.4493589401245117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6176878213882446, + "epoch": 0.19, + "learning_rate": 4.905748098055791e-05, + "loss": 0.4383, + "step": 223, + "task_loss": 0.7622699737548828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2898561656475067, + "epoch": 0.19, + "learning_rate": 4.905325443786982e-05, + "loss": 0.3635, + "step": 224, + "task_loss": 0.5839920043945312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3696010708808899, + "epoch": 0.19, + "learning_rate": 4.904902789518174e-05, + "loss": 0.3661, + "step": 225, + "task_loss": 0.7432633638381958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20091712474822998, + "epoch": 0.19, + "learning_rate": 4.904480135249366e-05, + "loss": 0.2965, + "step": 226, + "task_loss": 0.32529860734939575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39854955673217773, + "epoch": 0.19, + "learning_rate": 4.9040574809805587e-05, + "loss": 0.4011, + "step": 227, + "task_loss": 0.40981027483940125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2788071036338806, + "epoch": 0.19, + "learning_rate": 4.90363482671175e-05, + "loss": 0.4525, + "step": 228, + "task_loss": 1.2623895406723022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46013692021369934, + "epoch": 0.19, + "learning_rate": 4.903212172442942e-05, + "loss": 0.5073, + "step": 229, + "task_loss": 1.6259897947311401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3463882505893707, + "epoch": 0.19, + "learning_rate": 4.902789518174134e-05, + "loss": 0.2795, + "step": 230, + "task_loss": 0.9291456937789917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29880496859550476, + "epoch": 0.2, + "learning_rate": 4.902366863905325e-05, + "loss": 0.3014, + "step": 231, + "task_loss": 0.7562724351882935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24067357182502747, + "epoch": 0.2, + "learning_rate": 4.901944209636518e-05, + "loss": 0.3351, + "step": 232, + "task_loss": 1.0021733045578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.319251149892807, + "epoch": 0.2, + "learning_rate": 4.90152155536771e-05, + "loss": 0.2629, + "step": 233, + "task_loss": 0.45998692512512207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24535714089870453, + "epoch": 0.2, + "learning_rate": 4.901098901098901e-05, + "loss": 0.4441, + "step": 234, + "task_loss": 0.20303231477737427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2730843424797058, + "epoch": 0.2, + "learning_rate": 4.900676246830093e-05, + "loss": 0.3293, + "step": 235, + "task_loss": 0.5525544881820679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19751249253749847, + "epoch": 0.2, + "learning_rate": 4.900253592561285e-05, + "loss": 0.2836, + "step": 236, + "task_loss": 0.4372982382774353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30453741550445557, + "epoch": 0.2, + "learning_rate": 4.899830938292477e-05, + "loss": 0.3706, + "step": 237, + "task_loss": 1.265528917312622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1653909683227539, + "epoch": 0.2, + "learning_rate": 4.899408284023669e-05, + "loss": 0.4835, + "step": 238, + "task_loss": 1.224724292755127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3479859530925751, + "epoch": 0.2, + "learning_rate": 4.898985629754861e-05, + "loss": 0.3989, + "step": 239, + "task_loss": 0.655051052570343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20435407757759094, + "epoch": 0.2, + "learning_rate": 4.898562975486053e-05, + "loss": 0.2831, + "step": 240, + "task_loss": 0.28512734174728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4552597403526306, + "epoch": 0.2, + "learning_rate": 4.898140321217244e-05, + "loss": 0.3285, + "step": 241, + "task_loss": 0.4700802266597748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24903175234794617, + "epoch": 0.2, + "learning_rate": 4.897717666948436e-05, + "loss": 0.3497, + "step": 242, + "task_loss": 0.401705801486969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33746951818466187, + "epoch": 0.21, + "learning_rate": 4.897295012679628e-05, + "loss": 0.3429, + "step": 243, + "task_loss": 0.26708200573921204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3835751712322235, + "epoch": 0.21, + "learning_rate": 4.89687235841082e-05, + "loss": 0.2924, + "step": 244, + "task_loss": 0.7240639925003052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3886156678199768, + "epoch": 0.21, + "learning_rate": 4.896449704142012e-05, + "loss": 0.3834, + "step": 245, + "task_loss": 0.26518186926841736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30780693888664246, + "epoch": 0.21, + "learning_rate": 4.896027049873204e-05, + "loss": 0.3604, + "step": 246, + "task_loss": 0.5353126525878906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2051931768655777, + "epoch": 0.21, + "learning_rate": 4.8956043956043954e-05, + "loss": 0.3104, + "step": 247, + "task_loss": 0.45676663517951965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25273922085762024, + "epoch": 0.21, + "learning_rate": 4.8951817413355874e-05, + "loss": 0.3145, + "step": 248, + "task_loss": 0.20090925693511963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5339232683181763, + "epoch": 0.21, + "learning_rate": 4.89475908706678e-05, + "loss": 0.4151, + "step": 249, + "task_loss": 0.4397232234477997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49212467670440674, + "epoch": 0.21, + "learning_rate": 4.8943364327979713e-05, + "loss": 0.3511, + "step": 250, + "task_loss": 0.25163865089416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5351613163948059, + "epoch": 0.21, + "learning_rate": 4.893913778529163e-05, + "loss": 0.3722, + "step": 251, + "task_loss": 1.080898642539978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2241917848587036, + "epoch": 0.21, + "learning_rate": 4.893491124260355e-05, + "loss": 0.3855, + "step": 252, + "task_loss": 0.7277026176452637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23997260630130768, + "epoch": 0.21, + "learning_rate": 4.8930684699915466e-05, + "loss": 0.4527, + "step": 253, + "task_loss": 0.6579324007034302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41035687923431396, + "epoch": 0.21, + "learning_rate": 4.892645815722739e-05, + "loss": 0.2967, + "step": 254, + "task_loss": 1.1071937084197998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4491110146045685, + "epoch": 0.22, + "learning_rate": 4.892223161453931e-05, + "loss": 0.3185, + "step": 255, + "task_loss": 0.6341534852981567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31548792123794556, + "epoch": 0.22, + "learning_rate": 4.891800507185123e-05, + "loss": 0.3219, + "step": 256, + "task_loss": 0.6876879930496216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3046998679637909, + "epoch": 0.22, + "learning_rate": 4.8913778529163145e-05, + "loss": 0.3781, + "step": 257, + "task_loss": 0.5512405633926392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35629910230636597, + "epoch": 0.22, + "learning_rate": 4.8909551986475065e-05, + "loss": 0.4538, + "step": 258, + "task_loss": 0.9044925570487976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47341591119766235, + "epoch": 0.22, + "learning_rate": 4.8905325443786984e-05, + "loss": 0.3629, + "step": 259, + "task_loss": 1.0995041131973267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16987548768520355, + "epoch": 0.22, + "learning_rate": 4.8901098901098904e-05, + "loss": 0.2282, + "step": 260, + "task_loss": 0.08861540257930756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3050326704978943, + "epoch": 0.22, + "learning_rate": 4.8896872358410824e-05, + "loss": 0.3364, + "step": 261, + "task_loss": 1.1310858726501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16695277392864227, + "epoch": 0.22, + "learning_rate": 4.8892645815722744e-05, + "loss": 0.3632, + "step": 262, + "task_loss": 0.4487459659576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46074435114860535, + "epoch": 0.22, + "learning_rate": 4.8888419273034656e-05, + "loss": 0.3622, + "step": 263, + "task_loss": 0.5444375872612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1946798712015152, + "epoch": 0.22, + "learning_rate": 4.8884192730346576e-05, + "loss": 0.253, + "step": 264, + "task_loss": 0.38776695728302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2863951325416565, + "epoch": 0.22, + "learning_rate": 4.8879966187658496e-05, + "loss": 0.3384, + "step": 265, + "task_loss": 0.5527970194816589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20850254595279694, + "epoch": 0.22, + "learning_rate": 4.8875739644970416e-05, + "loss": 0.3121, + "step": 266, + "task_loss": 0.21559667587280273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3922833800315857, + "epoch": 0.23, + "learning_rate": 4.8871513102282335e-05, + "loss": 0.3714, + "step": 267, + "task_loss": 1.0098217725753784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33430397510528564, + "epoch": 0.23, + "learning_rate": 4.8867286559594255e-05, + "loss": 0.3824, + "step": 268, + "task_loss": 0.7453335523605347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7871788740158081, + "epoch": 0.23, + "learning_rate": 4.8863060016906175e-05, + "loss": 0.5347, + "step": 269, + "task_loss": 0.6393784284591675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38466086983680725, + "epoch": 0.23, + "learning_rate": 4.885883347421809e-05, + "loss": 0.2949, + "step": 270, + "task_loss": 0.32888132333755493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3655931055545807, + "epoch": 0.23, + "learning_rate": 4.8854606931530014e-05, + "loss": 0.3399, + "step": 271, + "task_loss": 1.2650548219680786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.199500173330307, + "epoch": 0.23, + "learning_rate": 4.8850380388841934e-05, + "loss": 0.4179, + "step": 272, + "task_loss": 0.200534850358963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20676256716251373, + "epoch": 0.23, + "learning_rate": 4.884615384615385e-05, + "loss": 0.3105, + "step": 273, + "task_loss": 0.13629695773124695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35268110036849976, + "epoch": 0.23, + "learning_rate": 4.884192730346577e-05, + "loss": 0.3439, + "step": 274, + "task_loss": 0.3449114263057709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19986888766288757, + "epoch": 0.23, + "learning_rate": 4.8837700760777687e-05, + "loss": 0.2483, + "step": 275, + "task_loss": 0.08239832520484924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36058855056762695, + "epoch": 0.23, + "learning_rate": 4.8833474218089606e-05, + "loss": 0.3632, + "step": 276, + "task_loss": 0.344508558511734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17777377367019653, + "epoch": 0.23, + "learning_rate": 4.8829247675401526e-05, + "loss": 0.2685, + "step": 277, + "task_loss": 0.16075986623764038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31643208861351013, + "epoch": 0.23, + "learning_rate": 4.8825021132713446e-05, + "loss": 0.263, + "step": 278, + "task_loss": 0.30614912509918213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24418959021568298, + "epoch": 0.24, + "learning_rate": 4.882079459002536e-05, + "loss": 0.3307, + "step": 279, + "task_loss": 0.44672325253486633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3086036145687103, + "epoch": 0.24, + "learning_rate": 4.881656804733728e-05, + "loss": 0.361, + "step": 280, + "task_loss": 0.5483068227767944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2502906322479248, + "epoch": 0.24, + "learning_rate": 4.88123415046492e-05, + "loss": 0.3194, + "step": 281, + "task_loss": 0.7591536045074463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5221872329711914, + "epoch": 0.24, + "learning_rate": 4.880811496196112e-05, + "loss": 0.3584, + "step": 282, + "task_loss": 0.8819484114646912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32768136262893677, + "epoch": 0.24, + "learning_rate": 4.880388841927304e-05, + "loss": 0.4272, + "step": 283, + "task_loss": 0.9988192915916443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3342861235141754, + "epoch": 0.24, + "learning_rate": 4.879966187658496e-05, + "loss": 0.3513, + "step": 284, + "task_loss": 0.6095466613769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29538947343826294, + "epoch": 0.24, + "learning_rate": 4.879543533389688e-05, + "loss": 0.2872, + "step": 285, + "task_loss": 0.6634162664413452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17239470779895782, + "epoch": 0.24, + "learning_rate": 4.879120879120879e-05, + "loss": 0.3073, + "step": 286, + "task_loss": 0.2715796232223511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34022772312164307, + "epoch": 0.24, + "learning_rate": 4.878698224852071e-05, + "loss": 0.3572, + "step": 287, + "task_loss": 0.46570080518722534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.586940348148346, + "epoch": 0.24, + "learning_rate": 4.8782755705832636e-05, + "loss": 0.4532, + "step": 288, + "task_loss": 0.28270670771598816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3450503945350647, + "epoch": 0.24, + "learning_rate": 4.877852916314455e-05, + "loss": 0.3776, + "step": 289, + "task_loss": 1.6055642366409302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23230475187301636, + "epoch": 0.24, + "learning_rate": 4.877430262045647e-05, + "loss": 0.3155, + "step": 290, + "task_loss": 0.7673444151878357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5032759308815002, + "epoch": 0.25, + "learning_rate": 4.877007607776839e-05, + "loss": 0.4087, + "step": 291, + "task_loss": 0.4153584837913513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20205602049827576, + "epoch": 0.25, + "learning_rate": 4.87658495350803e-05, + "loss": 0.279, + "step": 292, + "task_loss": 0.2720683217048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3824338912963867, + "epoch": 0.25, + "learning_rate": 4.876162299239223e-05, + "loss": 0.3928, + "step": 293, + "task_loss": 0.43445995450019836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39851096272468567, + "epoch": 0.25, + "learning_rate": 4.875739644970415e-05, + "loss": 0.383, + "step": 294, + "task_loss": 0.47297611832618713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23739580810070038, + "epoch": 0.25, + "learning_rate": 4.875316990701606e-05, + "loss": 0.288, + "step": 295, + "task_loss": 0.28086382150650024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3505229949951172, + "epoch": 0.25, + "learning_rate": 4.874894336432798e-05, + "loss": 0.3839, + "step": 296, + "task_loss": 0.9735152125358582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31758180260658264, + "epoch": 0.25, + "learning_rate": 4.87447168216399e-05, + "loss": 0.3954, + "step": 297, + "task_loss": 0.746353805065155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5545935034751892, + "epoch": 0.25, + "learning_rate": 4.874049027895182e-05, + "loss": 0.4143, + "step": 298, + "task_loss": 0.7208579182624817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25441351532936096, + "epoch": 0.25, + "learning_rate": 4.873626373626374e-05, + "loss": 0.3798, + "step": 299, + "task_loss": 0.39883047342300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4335569739341736, + "epoch": 0.25, + "learning_rate": 4.873203719357566e-05, + "loss": 0.4675, + "step": 300, + "task_loss": 0.5771480798721313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.200557142496109, + "epoch": 0.25, + "learning_rate": 4.872781065088758e-05, + "loss": 0.3159, + "step": 301, + "task_loss": 0.08480185270309448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2761476933956146, + "epoch": 0.26, + "learning_rate": 4.872358410819949e-05, + "loss": 0.3051, + "step": 302, + "task_loss": 0.6513972282409668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2249608039855957, + "epoch": 0.26, + "learning_rate": 4.871935756551141e-05, + "loss": 0.3888, + "step": 303, + "task_loss": 0.5342172980308533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4718804359436035, + "epoch": 0.26, + "learning_rate": 4.871513102282333e-05, + "loss": 0.3778, + "step": 304, + "task_loss": 1.6125479936599731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7248421311378479, + "epoch": 0.26, + "learning_rate": 4.871090448013525e-05, + "loss": 0.4007, + "step": 305, + "task_loss": 0.9928491711616516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23324015736579895, + "epoch": 0.26, + "learning_rate": 4.870667793744717e-05, + "loss": 0.3242, + "step": 306, + "task_loss": 0.18504105508327484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33958256244659424, + "epoch": 0.26, + "learning_rate": 4.870245139475909e-05, + "loss": 0.3021, + "step": 307, + "task_loss": 0.9580204486846924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3680446147918701, + "epoch": 0.26, + "learning_rate": 4.8698224852071004e-05, + "loss": 0.3885, + "step": 308, + "task_loss": 1.010891079902649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4460088610649109, + "epoch": 0.26, + "learning_rate": 4.8693998309382924e-05, + "loss": 0.3097, + "step": 309, + "task_loss": 0.44874387979507446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2667539715766907, + "epoch": 0.26, + "learning_rate": 4.868977176669485e-05, + "loss": 0.4331, + "step": 310, + "task_loss": 1.2887728214263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1914120465517044, + "epoch": 0.26, + "learning_rate": 4.868554522400676e-05, + "loss": 0.4491, + "step": 311, + "task_loss": 0.14170601963996887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23233111202716827, + "epoch": 0.26, + "learning_rate": 4.868131868131868e-05, + "loss": 0.2712, + "step": 312, + "task_loss": 0.27271774411201477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27626264095306396, + "epoch": 0.26, + "learning_rate": 4.86770921386306e-05, + "loss": 0.2319, + "step": 313, + "task_loss": 0.6159605979919434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.231930673122406, + "epoch": 0.27, + "learning_rate": 4.867286559594252e-05, + "loss": 0.3125, + "step": 314, + "task_loss": 0.47959962487220764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35934150218963623, + "epoch": 0.27, + "learning_rate": 4.866863905325444e-05, + "loss": 0.3128, + "step": 315, + "task_loss": 0.6822903156280518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2281390279531479, + "epoch": 0.27, + "learning_rate": 4.866441251056636e-05, + "loss": 0.3146, + "step": 316, + "task_loss": 0.1894877403974533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4440813660621643, + "epoch": 0.27, + "learning_rate": 4.866018596787828e-05, + "loss": 0.4482, + "step": 317, + "task_loss": 1.048689365386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1617339700460434, + "epoch": 0.27, + "learning_rate": 4.8655959425190195e-05, + "loss": 0.303, + "step": 318, + "task_loss": 0.5686681270599365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38979366421699524, + "epoch": 0.27, + "learning_rate": 4.8651732882502114e-05, + "loss": 0.3538, + "step": 319, + "task_loss": 0.605825662612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3023730218410492, + "epoch": 0.27, + "learning_rate": 4.8647506339814034e-05, + "loss": 0.2528, + "step": 320, + "task_loss": 0.5307630300521851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3410377502441406, + "epoch": 0.27, + "learning_rate": 4.8643279797125954e-05, + "loss": 0.4559, + "step": 321, + "task_loss": 1.0144431591033936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29597872495651245, + "epoch": 0.27, + "learning_rate": 4.8639053254437874e-05, + "loss": 0.3011, + "step": 322, + "task_loss": 0.5639635324478149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5708921551704407, + "epoch": 0.27, + "learning_rate": 4.863482671174979e-05, + "loss": 0.3387, + "step": 323, + "task_loss": 1.2709810733795166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3579939007759094, + "epoch": 0.27, + "learning_rate": 4.8630600169061706e-05, + "loss": 0.3968, + "step": 324, + "task_loss": 0.6427634358406067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31119680404663086, + "epoch": 0.27, + "learning_rate": 4.8626373626373626e-05, + "loss": 0.4006, + "step": 325, + "task_loss": 0.3410301208496094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3839254677295685, + "epoch": 0.28, + "learning_rate": 4.8622147083685546e-05, + "loss": 0.2692, + "step": 326, + "task_loss": 0.5920546650886536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31953269243240356, + "epoch": 0.28, + "learning_rate": 4.861792054099747e-05, + "loss": 0.2991, + "step": 327, + "task_loss": 0.9777121543884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19304096698760986, + "epoch": 0.28, + "learning_rate": 4.8613693998309385e-05, + "loss": 0.3123, + "step": 328, + "task_loss": 0.5040408968925476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2374301254749298, + "epoch": 0.28, + "learning_rate": 4.8609467455621305e-05, + "loss": 0.4676, + "step": 329, + "task_loss": 0.4793791174888611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49753913283348083, + "epoch": 0.28, + "learning_rate": 4.8605240912933225e-05, + "loss": 0.4019, + "step": 330, + "task_loss": 0.3474293649196625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4127744436264038, + "epoch": 0.28, + "learning_rate": 4.860101437024514e-05, + "loss": 0.6179, + "step": 331, + "task_loss": 0.3667612075805664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.508611798286438, + "epoch": 0.28, + "learning_rate": 4.8596787827557064e-05, + "loss": 0.3667, + "step": 332, + "task_loss": 0.5460253357887268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3820558786392212, + "epoch": 0.28, + "learning_rate": 4.8592561284868984e-05, + "loss": 0.3105, + "step": 333, + "task_loss": 0.3513747751712799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26280924677848816, + "epoch": 0.28, + "learning_rate": 4.85883347421809e-05, + "loss": 0.3708, + "step": 334, + "task_loss": 0.702881932258606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3734799027442932, + "epoch": 0.28, + "learning_rate": 4.858410819949282e-05, + "loss": 0.2676, + "step": 335, + "task_loss": 0.6505433917045593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19496524333953857, + "epoch": 0.28, + "learning_rate": 4.8579881656804736e-05, + "loss": 0.2688, + "step": 336, + "task_loss": 0.0567777082324028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30779075622558594, + "epoch": 0.28, + "learning_rate": 4.857565511411665e-05, + "loss": 0.3478, + "step": 337, + "task_loss": 0.9644184112548828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3214837312698364, + "epoch": 0.29, + "learning_rate": 4.8571428571428576e-05, + "loss": 0.381, + "step": 338, + "task_loss": 0.8313199877738953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40075811743736267, + "epoch": 0.29, + "learning_rate": 4.8567202028740496e-05, + "loss": 0.3495, + "step": 339, + "task_loss": 0.6049473881721497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23903967440128326, + "epoch": 0.29, + "learning_rate": 4.856297548605241e-05, + "loss": 0.2891, + "step": 340, + "task_loss": 0.9305899739265442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2694218158721924, + "epoch": 0.29, + "learning_rate": 4.855874894336433e-05, + "loss": 0.3745, + "step": 341, + "task_loss": 0.45736974477767944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23542985320091248, + "epoch": 0.29, + "learning_rate": 4.855452240067625e-05, + "loss": 0.381, + "step": 342, + "task_loss": 0.1643262803554535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30288824439048767, + "epoch": 0.29, + "learning_rate": 4.855029585798817e-05, + "loss": 0.3187, + "step": 343, + "task_loss": 0.4931211769580841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4340369403362274, + "epoch": 0.29, + "learning_rate": 4.854606931530009e-05, + "loss": 0.4145, + "step": 344, + "task_loss": 0.9261170029640198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2673012316226959, + "epoch": 0.29, + "learning_rate": 4.854184277261201e-05, + "loss": 0.307, + "step": 345, + "task_loss": 0.9187374114990234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25153711438179016, + "epoch": 0.29, + "learning_rate": 4.853761622992393e-05, + "loss": 0.2396, + "step": 346, + "task_loss": 0.3297625482082367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48461300134658813, + "epoch": 0.29, + "learning_rate": 4.853338968723584e-05, + "loss": 0.3127, + "step": 347, + "task_loss": 0.8848038911819458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32733801007270813, + "epoch": 0.29, + "learning_rate": 4.852916314454776e-05, + "loss": 0.3665, + "step": 348, + "task_loss": 0.5857535004615784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39943447709083557, + "epoch": 0.29, + "learning_rate": 4.8524936601859686e-05, + "loss": 0.4085, + "step": 349, + "task_loss": 0.06979019939899445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26431018114089966, + "epoch": 0.3, + "learning_rate": 4.85207100591716e-05, + "loss": 0.4182, + "step": 350, + "task_loss": 0.16538062691688538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21030639111995697, + "epoch": 0.3, + "learning_rate": 4.851648351648352e-05, + "loss": 0.3238, + "step": 351, + "task_loss": 0.5685637593269348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2704937160015106, + "epoch": 0.3, + "learning_rate": 4.851225697379544e-05, + "loss": 0.3425, + "step": 352, + "task_loss": 0.9471548199653625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3419402241706848, + "epoch": 0.3, + "learning_rate": 4.850803043110735e-05, + "loss": 0.3797, + "step": 353, + "task_loss": 0.4429088532924652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34285545349121094, + "epoch": 0.3, + "learning_rate": 4.850380388841927e-05, + "loss": 0.4639, + "step": 354, + "task_loss": 0.16208268702030182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17453980445861816, + "epoch": 0.3, + "learning_rate": 4.84995773457312e-05, + "loss": 0.3265, + "step": 355, + "task_loss": 0.1706947535276413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12912121415138245, + "epoch": 0.3, + "learning_rate": 4.849535080304311e-05, + "loss": 0.3914, + "step": 356, + "task_loss": 0.8993921875953674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2798566222190857, + "epoch": 0.3, + "learning_rate": 4.849112426035503e-05, + "loss": 0.2617, + "step": 357, + "task_loss": 0.2404172122478485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38516366481781006, + "epoch": 0.3, + "learning_rate": 4.848689771766695e-05, + "loss": 0.3581, + "step": 358, + "task_loss": 0.7003974318504333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3995153307914734, + "epoch": 0.3, + "learning_rate": 4.848267117497887e-05, + "loss": 0.3755, + "step": 359, + "task_loss": 0.6110330820083618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5429834723472595, + "epoch": 0.3, + "learning_rate": 4.847844463229079e-05, + "loss": 0.3508, + "step": 360, + "task_loss": 0.4747626781463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5203607082366943, + "epoch": 0.3, + "learning_rate": 4.847421808960271e-05, + "loss": 0.4817, + "step": 361, + "task_loss": 0.9935450553894043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5610111951828003, + "epoch": 0.31, + "learning_rate": 4.846999154691463e-05, + "loss": 0.3601, + "step": 362, + "task_loss": 0.6307058334350586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26061660051345825, + "epoch": 0.31, + "learning_rate": 4.846576500422654e-05, + "loss": 0.4462, + "step": 363, + "task_loss": 0.7939696907997131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2086484432220459, + "epoch": 0.31, + "learning_rate": 4.846153846153846e-05, + "loss": 0.4501, + "step": 364, + "task_loss": 1.2991604804992676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2818431258201599, + "epoch": 0.31, + "learning_rate": 4.845731191885038e-05, + "loss": 0.2959, + "step": 365, + "task_loss": 0.15905030071735382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2016371339559555, + "epoch": 0.31, + "learning_rate": 4.84530853761623e-05, + "loss": 0.3659, + "step": 366, + "task_loss": 0.34598076343536377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32798513770103455, + "epoch": 0.31, + "learning_rate": 4.844885883347422e-05, + "loss": 0.3912, + "step": 367, + "task_loss": 0.8429501056671143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41197237372398376, + "epoch": 0.31, + "learning_rate": 4.844463229078614e-05, + "loss": 0.4, + "step": 368, + "task_loss": 0.6444922685623169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26251524686813354, + "epoch": 0.31, + "learning_rate": 4.8440405748098054e-05, + "loss": 0.3193, + "step": 369, + "task_loss": 0.41190579533576965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6477534770965576, + "epoch": 0.31, + "learning_rate": 4.8436179205409974e-05, + "loss": 0.5108, + "step": 370, + "task_loss": 0.6784238815307617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.268083393573761, + "epoch": 0.31, + "learning_rate": 4.843195266272189e-05, + "loss": 0.3435, + "step": 371, + "task_loss": 0.0465749129652977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17515163123607635, + "epoch": 0.31, + "learning_rate": 4.842772612003382e-05, + "loss": 0.318, + "step": 372, + "task_loss": 0.5904419422149658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21761217713356018, + "epoch": 0.32, + "learning_rate": 4.842349957734573e-05, + "loss": 0.2948, + "step": 373, + "task_loss": 0.47346076369285583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14544571936130524, + "epoch": 0.32, + "learning_rate": 4.841927303465765e-05, + "loss": 0.3477, + "step": 374, + "task_loss": 0.17858652770519257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2630438506603241, + "epoch": 0.32, + "learning_rate": 4.841504649196957e-05, + "loss": 0.3459, + "step": 375, + "task_loss": 0.3764711916446686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4630126953125, + "epoch": 0.32, + "learning_rate": 4.8410819949281485e-05, + "loss": 0.3812, + "step": 376, + "task_loss": 0.3913819491863251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21063268184661865, + "epoch": 0.32, + "learning_rate": 4.840659340659341e-05, + "loss": 0.3403, + "step": 377, + "task_loss": 0.29857826232910156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4019916355609894, + "epoch": 0.32, + "learning_rate": 4.840236686390533e-05, + "loss": 0.3179, + "step": 378, + "task_loss": 0.383775532245636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4334412217140198, + "epoch": 0.32, + "learning_rate": 4.8398140321217245e-05, + "loss": 0.2813, + "step": 379, + "task_loss": 0.09898050129413605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27811628580093384, + "epoch": 0.32, + "learning_rate": 4.8393913778529164e-05, + "loss": 0.3137, + "step": 380, + "task_loss": 0.40869438648223877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5298483371734619, + "epoch": 0.32, + "learning_rate": 4.8389687235841084e-05, + "loss": 0.395, + "step": 381, + "task_loss": 0.770438015460968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2107640653848648, + "epoch": 0.32, + "learning_rate": 4.8385460693153004e-05, + "loss": 0.336, + "step": 382, + "task_loss": 0.475759357213974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4724220335483551, + "epoch": 0.32, + "learning_rate": 4.8381234150464923e-05, + "loss": 0.3241, + "step": 383, + "task_loss": 1.1706597805023193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33152490854263306, + "epoch": 0.32, + "learning_rate": 4.837700760777684e-05, + "loss": 0.3438, + "step": 384, + "task_loss": 0.24386684596538544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4147869646549225, + "epoch": 0.33, + "learning_rate": 4.8372781065088756e-05, + "loss": 0.418, + "step": 385, + "task_loss": 0.7812007665634155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20956742763519287, + "epoch": 0.33, + "learning_rate": 4.8368554522400676e-05, + "loss": 0.3443, + "step": 386, + "task_loss": 0.2561536133289337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4003746509552002, + "epoch": 0.33, + "learning_rate": 4.8364327979712596e-05, + "loss": 0.4963, + "step": 387, + "task_loss": 0.5156252980232239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.482105016708374, + "epoch": 0.33, + "learning_rate": 4.8360101437024515e-05, + "loss": 0.4836, + "step": 388, + "task_loss": 0.6523312330245972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47945621609687805, + "epoch": 0.33, + "learning_rate": 4.8355874894336435e-05, + "loss": 0.3995, + "step": 389, + "task_loss": 0.8960033655166626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3446442186832428, + "epoch": 0.33, + "learning_rate": 4.8351648351648355e-05, + "loss": 0.4004, + "step": 390, + "task_loss": 0.7979370951652527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45113879442214966, + "epoch": 0.33, + "learning_rate": 4.8347421808960275e-05, + "loss": 0.2624, + "step": 391, + "task_loss": 0.980656623840332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39476102590560913, + "epoch": 0.33, + "learning_rate": 4.834319526627219e-05, + "loss": 0.5158, + "step": 392, + "task_loss": 0.29816144704818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25753751397132874, + "epoch": 0.33, + "learning_rate": 4.833896872358411e-05, + "loss": 0.3639, + "step": 393, + "task_loss": 1.2662850618362427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.302455335855484, + "epoch": 0.33, + "learning_rate": 4.8334742180896034e-05, + "loss": 0.3695, + "step": 394, + "task_loss": 0.6532285809516907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19427794218063354, + "epoch": 0.33, + "learning_rate": 4.833051563820795e-05, + "loss": 0.2602, + "step": 395, + "task_loss": 0.607721209526062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28633415699005127, + "epoch": 0.33, + "learning_rate": 4.8326289095519867e-05, + "loss": 0.3319, + "step": 396, + "task_loss": 0.4353885352611542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25589054822921753, + "epoch": 0.34, + "learning_rate": 4.8322062552831786e-05, + "loss": 0.2729, + "step": 397, + "task_loss": 0.45632249116897583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5806112885475159, + "epoch": 0.34, + "learning_rate": 4.83178360101437e-05, + "loss": 0.4, + "step": 398, + "task_loss": 0.6158548593521118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21340397000312805, + "epoch": 0.34, + "learning_rate": 4.8313609467455626e-05, + "loss": 0.3041, + "step": 399, + "task_loss": 0.5302818417549133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4214976131916046, + "epoch": 0.34, + "learning_rate": 4.8309382924767545e-05, + "loss": 0.3126, + "step": 400, + "task_loss": 1.0148353576660156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5677328705787659, + "epoch": 0.34, + "learning_rate": 4.8305156382079465e-05, + "loss": 0.3985, + "step": 401, + "task_loss": 1.1190402507781982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43479910492897034, + "epoch": 0.34, + "learning_rate": 4.830092983939138e-05, + "loss": 0.2808, + "step": 402, + "task_loss": 0.7361778020858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3868566155433655, + "epoch": 0.34, + "learning_rate": 4.82967032967033e-05, + "loss": 0.3044, + "step": 403, + "task_loss": 0.21431250870227814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3633410930633545, + "epoch": 0.34, + "learning_rate": 4.829247675401522e-05, + "loss": 0.4023, + "step": 404, + "task_loss": 1.0183416604995728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31340885162353516, + "epoch": 0.34, + "learning_rate": 4.828825021132714e-05, + "loss": 0.3294, + "step": 405, + "task_loss": 0.3995034694671631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22247859835624695, + "epoch": 0.34, + "learning_rate": 4.828402366863906e-05, + "loss": 0.3844, + "step": 406, + "task_loss": 0.4827563464641571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3401729464530945, + "epoch": 0.34, + "learning_rate": 4.827979712595098e-05, + "loss": 0.409, + "step": 407, + "task_loss": 0.37865233421325684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23676316440105438, + "epoch": 0.34, + "learning_rate": 4.827557058326289e-05, + "loss": 0.2726, + "step": 408, + "task_loss": 0.5899599194526672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47724971175193787, + "epoch": 0.35, + "learning_rate": 4.827134404057481e-05, + "loss": 0.4291, + "step": 409, + "task_loss": 0.34716418385505676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20625603199005127, + "epoch": 0.35, + "learning_rate": 4.826711749788673e-05, + "loss": 0.2917, + "step": 410, + "task_loss": 0.10252900421619415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5770359039306641, + "epoch": 0.35, + "learning_rate": 4.826289095519865e-05, + "loss": 0.3665, + "step": 411, + "task_loss": 0.7841318845748901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12576042115688324, + "epoch": 0.35, + "learning_rate": 4.825866441251057e-05, + "loss": 0.3062, + "step": 412, + "task_loss": 0.005407290533185005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16354574263095856, + "epoch": 0.35, + "learning_rate": 4.825443786982249e-05, + "loss": 0.2629, + "step": 413, + "task_loss": 0.6066843867301941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18714019656181335, + "epoch": 0.35, + "learning_rate": 4.82502113271344e-05, + "loss": 0.4209, + "step": 414, + "task_loss": 1.005037784576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7463491559028625, + "epoch": 0.35, + "learning_rate": 4.824598478444632e-05, + "loss": 0.4831, + "step": 415, + "task_loss": 0.5606073141098022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35003143548965454, + "epoch": 0.35, + "learning_rate": 4.824175824175825e-05, + "loss": 0.3597, + "step": 416, + "task_loss": 0.030992213636636734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35349225997924805, + "epoch": 0.35, + "learning_rate": 4.823753169907017e-05, + "loss": 0.4748, + "step": 417, + "task_loss": 0.12775585055351257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3403266966342926, + "epoch": 0.35, + "learning_rate": 4.823330515638208e-05, + "loss": 0.3222, + "step": 418, + "task_loss": 0.5558363199234009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3834650218486786, + "epoch": 0.35, + "learning_rate": 4.8229078613694e-05, + "loss": 0.3528, + "step": 419, + "task_loss": 0.7809251546859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24055622518062592, + "epoch": 0.35, + "learning_rate": 4.822485207100592e-05, + "loss": 0.416, + "step": 420, + "task_loss": 0.8119294047355652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34761524200439453, + "epoch": 0.36, + "learning_rate": 4.822062552831784e-05, + "loss": 0.3734, + "step": 421, + "task_loss": 0.5272291302680969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17843936383724213, + "epoch": 0.36, + "learning_rate": 4.821639898562976e-05, + "loss": 0.2783, + "step": 422, + "task_loss": 0.09369053691625595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14937134087085724, + "epoch": 0.36, + "learning_rate": 4.821217244294168e-05, + "loss": 0.2942, + "step": 423, + "task_loss": 0.5208826065063477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3691389262676239, + "epoch": 0.36, + "learning_rate": 4.820794590025359e-05, + "loss": 0.339, + "step": 424, + "task_loss": 0.6015507578849792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12653256952762604, + "epoch": 0.36, + "learning_rate": 4.820371935756551e-05, + "loss": 0.2732, + "step": 425, + "task_loss": 0.09884674102067947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3428172171115875, + "epoch": 0.36, + "learning_rate": 4.819949281487743e-05, + "loss": 0.4267, + "step": 426, + "task_loss": 1.3080663681030273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3141721785068512, + "epoch": 0.36, + "learning_rate": 4.819526627218935e-05, + "loss": 0.3324, + "step": 427, + "task_loss": 2.044431447982788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24937307834625244, + "epoch": 0.36, + "learning_rate": 4.819103972950127e-05, + "loss": 0.4084, + "step": 428, + "task_loss": 1.371000051498413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24519547820091248, + "epoch": 0.36, + "learning_rate": 4.818681318681319e-05, + "loss": 0.4358, + "step": 429, + "task_loss": 0.9066499471664429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20103178918361664, + "epoch": 0.36, + "learning_rate": 4.818258664412511e-05, + "loss": 0.3936, + "step": 430, + "task_loss": 1.0921999216079712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37281349301338196, + "epoch": 0.36, + "learning_rate": 4.8178360101437023e-05, + "loss": 0.4536, + "step": 431, + "task_loss": 0.82737135887146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2910953760147095, + "epoch": 0.36, + "learning_rate": 4.817413355874894e-05, + "loss": 0.3462, + "step": 432, + "task_loss": 0.5337796807289124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2561354637145996, + "epoch": 0.37, + "learning_rate": 4.816990701606087e-05, + "loss": 0.2544, + "step": 433, + "task_loss": 0.3379628658294678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2789451777935028, + "epoch": 0.37, + "learning_rate": 4.816568047337278e-05, + "loss": 0.3645, + "step": 434, + "task_loss": 0.8729575872421265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.228978231549263, + "epoch": 0.37, + "learning_rate": 4.81614539306847e-05, + "loss": 0.2767, + "step": 435, + "task_loss": 0.10930343717336655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3351702094078064, + "epoch": 0.37, + "learning_rate": 4.815722738799662e-05, + "loss": 0.3671, + "step": 436, + "task_loss": 0.45560285449028015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23894400894641876, + "epoch": 0.37, + "learning_rate": 4.8153000845308535e-05, + "loss": 0.3189, + "step": 437, + "task_loss": 1.1994609832763672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4025985598564148, + "epoch": 0.37, + "learning_rate": 4.814877430262046e-05, + "loss": 0.4706, + "step": 438, + "task_loss": 1.3986836671829224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28973841667175293, + "epoch": 0.37, + "learning_rate": 4.814454775993238e-05, + "loss": 0.3394, + "step": 439, + "task_loss": 0.7910912036895752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18264514207839966, + "epoch": 0.37, + "learning_rate": 4.8140321217244294e-05, + "loss": 0.3787, + "step": 440, + "task_loss": 0.49460968375205994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2948886752128601, + "epoch": 0.37, + "learning_rate": 4.8136094674556214e-05, + "loss": 0.3763, + "step": 441, + "task_loss": 0.28919515013694763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25420182943344116, + "epoch": 0.37, + "learning_rate": 4.8131868131868134e-05, + "loss": 0.2895, + "step": 442, + "task_loss": 0.8527222871780396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40356898307800293, + "epoch": 0.37, + "learning_rate": 4.8127641589180054e-05, + "loss": 0.4178, + "step": 443, + "task_loss": 0.5866603851318359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.392120361328125, + "epoch": 0.38, + "learning_rate": 4.812341504649197e-05, + "loss": 0.4148, + "step": 444, + "task_loss": 0.5381209254264832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29924556612968445, + "epoch": 0.38, + "learning_rate": 4.811918850380389e-05, + "loss": 0.3712, + "step": 445, + "task_loss": 0.7404133081436157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1895020306110382, + "epoch": 0.38, + "learning_rate": 4.811496196111581e-05, + "loss": 0.2502, + "step": 446, + "task_loss": 0.27927011251449585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2506890296936035, + "epoch": 0.38, + "learning_rate": 4.8110735418427726e-05, + "loss": 0.3402, + "step": 447, + "task_loss": 1.0304343700408936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28709763288497925, + "epoch": 0.38, + "learning_rate": 4.8106508875739645e-05, + "loss": 0.5573, + "step": 448, + "task_loss": 0.9209693670272827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39132872223854065, + "epoch": 0.38, + "learning_rate": 4.8102282333051565e-05, + "loss": 0.3815, + "step": 449, + "task_loss": 1.4431852102279663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22359013557434082, + "epoch": 0.38, + "learning_rate": 4.8098055790363485e-05, + "loss": 0.389, + "step": 450, + "task_loss": 0.2309865951538086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3252839148044586, + "epoch": 0.38, + "learning_rate": 4.8093829247675405e-05, + "loss": 0.2982, + "step": 451, + "task_loss": 0.3696426451206207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23491056263446808, + "epoch": 0.38, + "learning_rate": 4.8089602704987324e-05, + "loss": 0.2321, + "step": 452, + "task_loss": 0.05885794386267662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27181416749954224, + "epoch": 0.38, + "learning_rate": 4.808537616229924e-05, + "loss": 0.4129, + "step": 453, + "task_loss": 0.33676445484161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2541554868221283, + "epoch": 0.38, + "learning_rate": 4.808114961961116e-05, + "loss": 0.3788, + "step": 454, + "task_loss": 0.4041786193847656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1806519329547882, + "epoch": 0.38, + "learning_rate": 4.8076923076923084e-05, + "loss": 0.2849, + "step": 455, + "task_loss": 0.9338952898979187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37558436393737793, + "epoch": 0.39, + "learning_rate": 4.8072696534235e-05, + "loss": 0.3397, + "step": 456, + "task_loss": 0.12076132744550705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48241809010505676, + "epoch": 0.39, + "learning_rate": 4.8068469991546916e-05, + "loss": 0.2795, + "step": 457, + "task_loss": 0.3814953863620758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17230990529060364, + "epoch": 0.39, + "learning_rate": 4.8064243448858836e-05, + "loss": 0.4157, + "step": 458, + "task_loss": 1.0113098621368408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24254800379276276, + "epoch": 0.39, + "learning_rate": 4.8060016906170756e-05, + "loss": 0.362, + "step": 459, + "task_loss": 0.7341673374176025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29597097635269165, + "epoch": 0.39, + "learning_rate": 4.8055790363482676e-05, + "loss": 0.3422, + "step": 460, + "task_loss": 0.7422024607658386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24189169704914093, + "epoch": 0.39, + "learning_rate": 4.8051563820794595e-05, + "loss": 0.3411, + "step": 461, + "task_loss": 1.0566450357437134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21869753301143646, + "epoch": 0.39, + "learning_rate": 4.8047337278106515e-05, + "loss": 0.3826, + "step": 462, + "task_loss": 0.19208063185214996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29792067408561707, + "epoch": 0.39, + "learning_rate": 4.804311073541843e-05, + "loss": 0.3736, + "step": 463, + "task_loss": 0.3594626188278198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5067954659461975, + "epoch": 0.39, + "learning_rate": 4.803888419273035e-05, + "loss": 0.3097, + "step": 464, + "task_loss": 0.3228074312210083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29852721095085144, + "epoch": 0.39, + "learning_rate": 4.803465765004227e-05, + "loss": 0.2871, + "step": 465, + "task_loss": 0.5256339907646179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32675912976264954, + "epoch": 0.39, + "learning_rate": 4.803043110735419e-05, + "loss": 0.3054, + "step": 466, + "task_loss": 0.5943156480789185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29917216300964355, + "epoch": 0.39, + "learning_rate": 4.802620456466611e-05, + "loss": 0.2851, + "step": 467, + "task_loss": 0.3273777961730957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4044845700263977, + "epoch": 0.4, + "learning_rate": 4.802197802197803e-05, + "loss": 0.4042, + "step": 468, + "task_loss": 0.5378451943397522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12516695261001587, + "epoch": 0.4, + "learning_rate": 4.801775147928994e-05, + "loss": 0.3103, + "step": 469, + "task_loss": 0.06358525902032852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7075236439704895, + "epoch": 0.4, + "learning_rate": 4.801352493660186e-05, + "loss": 0.4171, + "step": 470, + "task_loss": 1.1705182790756226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.367145836353302, + "epoch": 0.4, + "learning_rate": 4.800929839391378e-05, + "loss": 0.3726, + "step": 471, + "task_loss": 0.8300154805183411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30779290199279785, + "epoch": 0.4, + "learning_rate": 4.80050718512257e-05, + "loss": 0.3512, + "step": 472, + "task_loss": 0.9597095251083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.468344509601593, + "epoch": 0.4, + "learning_rate": 4.800084530853762e-05, + "loss": 0.3587, + "step": 473, + "task_loss": 0.5425146818161011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38902226090431213, + "epoch": 0.4, + "learning_rate": 4.799661876584954e-05, + "loss": 0.268, + "step": 474, + "task_loss": 0.32854801416397095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2791878283023834, + "epoch": 0.4, + "learning_rate": 4.799239222316146e-05, + "loss": 0.3196, + "step": 475, + "task_loss": 0.4752495288848877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24759617447853088, + "epoch": 0.4, + "learning_rate": 4.798816568047337e-05, + "loss": 0.3489, + "step": 476, + "task_loss": 0.7786201238632202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2376200556755066, + "epoch": 0.4, + "learning_rate": 4.79839391377853e-05, + "loss": 0.3212, + "step": 477, + "task_loss": 0.2219945788383484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4746355712413788, + "epoch": 0.4, + "learning_rate": 4.797971259509722e-05, + "loss": 0.4187, + "step": 478, + "task_loss": 1.1486724615097046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38645029067993164, + "epoch": 0.4, + "learning_rate": 4.797548605240913e-05, + "loss": 0.4046, + "step": 479, + "task_loss": 1.1827445030212402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3958989679813385, + "epoch": 0.41, + "learning_rate": 4.797125950972105e-05, + "loss": 0.333, + "step": 480, + "task_loss": 0.9585304856300354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.555884599685669, + "epoch": 0.41, + "learning_rate": 4.796703296703297e-05, + "loss": 0.4266, + "step": 481, + "task_loss": 1.1617844104766846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4958420395851135, + "epoch": 0.41, + "learning_rate": 4.796280642434488e-05, + "loss": 0.4032, + "step": 482, + "task_loss": 0.5177137851715088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33605822920799255, + "epoch": 0.41, + "learning_rate": 4.795857988165681e-05, + "loss": 0.2613, + "step": 483, + "task_loss": 0.7914626002311707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23409125208854675, + "epoch": 0.41, + "learning_rate": 4.795435333896873e-05, + "loss": 0.3058, + "step": 484, + "task_loss": 0.5280863642692566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5291711688041687, + "epoch": 0.41, + "learning_rate": 4.795012679628064e-05, + "loss": 0.433, + "step": 485, + "task_loss": 1.4136499166488647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23166726529598236, + "epoch": 0.41, + "learning_rate": 4.794590025359256e-05, + "loss": 0.3855, + "step": 486, + "task_loss": 0.8607439994812012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2831115126609802, + "epoch": 0.41, + "learning_rate": 4.794167371090448e-05, + "loss": 0.3395, + "step": 487, + "task_loss": 0.775846540927887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2510896623134613, + "epoch": 0.41, + "learning_rate": 4.79374471682164e-05, + "loss": 0.2651, + "step": 488, + "task_loss": 0.3774098753929138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3026653528213501, + "epoch": 0.41, + "learning_rate": 4.793322062552832e-05, + "loss": 0.3462, + "step": 489, + "task_loss": 0.3346295952796936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29465240240097046, + "epoch": 0.41, + "learning_rate": 4.792899408284024e-05, + "loss": 0.3588, + "step": 490, + "task_loss": 0.2848721742630005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24557900428771973, + "epoch": 0.41, + "learning_rate": 4.792476754015216e-05, + "loss": 0.3079, + "step": 491, + "task_loss": 0.4794479012489319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6266154050827026, + "epoch": 0.42, + "learning_rate": 4.792054099746407e-05, + "loss": 0.4814, + "step": 492, + "task_loss": 0.4172706604003906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.466037392616272, + "epoch": 0.42, + "learning_rate": 4.791631445477599e-05, + "loss": 0.3706, + "step": 493, + "task_loss": 0.6787412166595459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28106802701950073, + "epoch": 0.42, + "learning_rate": 4.791208791208792e-05, + "loss": 0.4111, + "step": 494, + "task_loss": 0.06812220066785812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20570380985736847, + "epoch": 0.42, + "learning_rate": 4.790786136939983e-05, + "loss": 0.3746, + "step": 495, + "task_loss": 0.35065531730651855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3599362373352051, + "epoch": 0.42, + "learning_rate": 4.790363482671175e-05, + "loss": 0.3066, + "step": 496, + "task_loss": 0.9745230078697205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4406608045101166, + "epoch": 0.42, + "learning_rate": 4.789940828402367e-05, + "loss": 0.3656, + "step": 497, + "task_loss": 0.3754790723323822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2744731307029724, + "epoch": 0.42, + "learning_rate": 4.7895181741335585e-05, + "loss": 0.393, + "step": 498, + "task_loss": 0.4405898451805115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25891149044036865, + "epoch": 0.42, + "learning_rate": 4.7890955198647505e-05, + "loss": 0.2955, + "step": 499, + "task_loss": 0.5504593849182129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3910437226295471, + "epoch": 0.42, + "learning_rate": 4.788672865595943e-05, + "loss": 0.3011, + "step": 500, + "task_loss": 0.02430226095020771 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.9123960396039604, + "eval_loss": 0.19510453939437866, + "eval_runtime": 327.4971, + "eval_samples_per_second": 77.1, + "eval_steps_per_second": 0.605, + "step": 500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5021883249282837, + "epoch": 0.42, + "learning_rate": 4.7882502113271344e-05, + "loss": 0.4732, + "step": 501, + "task_loss": 1.2089918851852417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19148047268390656, + "epoch": 0.42, + "learning_rate": 4.7878275570583264e-05, + "loss": 0.2744, + "step": 502, + "task_loss": 0.11446057260036469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27991461753845215, + "epoch": 0.42, + "learning_rate": 4.7874049027895184e-05, + "loss": 0.3474, + "step": 503, + "task_loss": 0.5374883413314819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42400461435317993, + "epoch": 0.43, + "learning_rate": 4.7869822485207103e-05, + "loss": 0.4479, + "step": 504, + "task_loss": 0.6829332113265991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19470159709453583, + "epoch": 0.43, + "learning_rate": 4.786559594251902e-05, + "loss": 0.2586, + "step": 505, + "task_loss": 0.402244508266449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3099380135536194, + "epoch": 0.43, + "learning_rate": 4.786136939983094e-05, + "loss": 0.2832, + "step": 506, + "task_loss": 0.5082167983055115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.482271671295166, + "epoch": 0.43, + "learning_rate": 4.785714285714286e-05, + "loss": 0.3306, + "step": 507, + "task_loss": 0.6163338422775269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2019622027873993, + "epoch": 0.43, + "learning_rate": 4.7852916314454776e-05, + "loss": 0.2254, + "step": 508, + "task_loss": 0.2669321894645691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29598677158355713, + "epoch": 0.43, + "learning_rate": 4.7848689771766695e-05, + "loss": 0.3381, + "step": 509, + "task_loss": 0.652377724647522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3812240958213806, + "epoch": 0.43, + "learning_rate": 4.7844463229078615e-05, + "loss": 0.3229, + "step": 510, + "task_loss": 0.361339807510376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2678496241569519, + "epoch": 0.43, + "learning_rate": 4.7840236686390535e-05, + "loss": 0.3488, + "step": 511, + "task_loss": 0.5789118409156799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33221471309661865, + "epoch": 0.43, + "learning_rate": 4.7836010143702455e-05, + "loss": 0.488, + "step": 512, + "task_loss": 0.6737244725227356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35193681716918945, + "epoch": 0.43, + "learning_rate": 4.7831783601014374e-05, + "loss": 0.4167, + "step": 513, + "task_loss": 0.20153209567070007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28466469049453735, + "epoch": 0.43, + "learning_rate": 4.782755705832629e-05, + "loss": 0.3329, + "step": 514, + "task_loss": 0.9720994234085083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32120317220687866, + "epoch": 0.44, + "learning_rate": 4.782333051563821e-05, + "loss": 0.3825, + "step": 515, + "task_loss": 0.7925823926925659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2772601842880249, + "epoch": 0.44, + "learning_rate": 4.781910397295013e-05, + "loss": 0.3274, + "step": 516, + "task_loss": 0.3258647918701172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5394781827926636, + "epoch": 0.44, + "learning_rate": 4.781487743026205e-05, + "loss": 0.3648, + "step": 517, + "task_loss": 1.1306638717651367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43012502789497375, + "epoch": 0.44, + "learning_rate": 4.7810650887573966e-05, + "loss": 0.3062, + "step": 518, + "task_loss": 0.4624011516571045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38749969005584717, + "epoch": 0.44, + "learning_rate": 4.7806424344885886e-05, + "loss": 0.3651, + "step": 519, + "task_loss": 0.966245710849762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6148722171783447, + "epoch": 0.44, + "learning_rate": 4.7802197802197806e-05, + "loss": 0.3658, + "step": 520, + "task_loss": 0.24293069541454315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4697817862033844, + "epoch": 0.44, + "learning_rate": 4.779797125950972e-05, + "loss": 0.4882, + "step": 521, + "task_loss": 0.42285576462745667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2668307423591614, + "epoch": 0.44, + "learning_rate": 4.7793744716821645e-05, + "loss": 0.4171, + "step": 522, + "task_loss": 0.5063718557357788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21127350628376007, + "epoch": 0.44, + "learning_rate": 4.7789518174133565e-05, + "loss": 0.303, + "step": 523, + "task_loss": 0.40686243772506714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5263411998748779, + "epoch": 0.44, + "learning_rate": 4.778529163144548e-05, + "loss": 0.4068, + "step": 524, + "task_loss": 0.7424903512001038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22469529509544373, + "epoch": 0.44, + "learning_rate": 4.77810650887574e-05, + "loss": 0.3641, + "step": 525, + "task_loss": 0.3689127266407013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2438812255859375, + "epoch": 0.44, + "learning_rate": 4.777683854606932e-05, + "loss": 0.2921, + "step": 526, + "task_loss": 0.5155799984931946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23396344482898712, + "epoch": 0.45, + "learning_rate": 4.777261200338124e-05, + "loss": 0.3225, + "step": 527, + "task_loss": 0.9596747159957886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39731040596961975, + "epoch": 0.45, + "learning_rate": 4.776838546069316e-05, + "loss": 0.4376, + "step": 528, + "task_loss": 0.29169049859046936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22641459107398987, + "epoch": 0.45, + "learning_rate": 4.7764158918005077e-05, + "loss": 0.2559, + "step": 529, + "task_loss": 0.14912928640842438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.10016551613807678, + "epoch": 0.45, + "learning_rate": 4.775993237531699e-05, + "loss": 0.2922, + "step": 530, + "task_loss": 0.4573501646518707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28039270639419556, + "epoch": 0.45, + "learning_rate": 4.775570583262891e-05, + "loss": 0.3719, + "step": 531, + "task_loss": 0.1638871431350708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3260061740875244, + "epoch": 0.45, + "learning_rate": 4.775147928994083e-05, + "loss": 0.4424, + "step": 532, + "task_loss": 0.6587778329849243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5293354392051697, + "epoch": 0.45, + "learning_rate": 4.774725274725275e-05, + "loss": 0.4823, + "step": 533, + "task_loss": 0.3607713282108307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2598116993904114, + "epoch": 0.45, + "learning_rate": 4.774302620456467e-05, + "loss": 0.3749, + "step": 534, + "task_loss": 1.022348165512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2715606391429901, + "epoch": 0.45, + "learning_rate": 4.773879966187659e-05, + "loss": 0.285, + "step": 535, + "task_loss": 0.06552083790302277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3325073719024658, + "epoch": 0.45, + "learning_rate": 4.773457311918851e-05, + "loss": 0.3995, + "step": 536, + "task_loss": 0.3835010826587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36279869079589844, + "epoch": 0.45, + "learning_rate": 4.773034657650042e-05, + "loss": 0.4089, + "step": 537, + "task_loss": 0.8075441122055054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17732396721839905, + "epoch": 0.45, + "learning_rate": 4.772612003381234e-05, + "loss": 0.287, + "step": 538, + "task_loss": 0.7138428092002869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23727980256080627, + "epoch": 0.46, + "learning_rate": 4.772189349112427e-05, + "loss": 0.2274, + "step": 539, + "task_loss": 0.46086442470550537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3349863290786743, + "epoch": 0.46, + "learning_rate": 4.771766694843618e-05, + "loss": 0.3772, + "step": 540, + "task_loss": 0.38162460923194885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2596428692340851, + "epoch": 0.46, + "learning_rate": 4.77134404057481e-05, + "loss": 0.4017, + "step": 541, + "task_loss": 0.4579458236694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.267270565032959, + "epoch": 0.46, + "learning_rate": 4.770921386306002e-05, + "loss": 0.284, + "step": 542, + "task_loss": 0.7062729597091675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4117518961429596, + "epoch": 0.46, + "learning_rate": 4.770498732037193e-05, + "loss": 0.298, + "step": 543, + "task_loss": 0.9053981900215149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4807129204273224, + "epoch": 0.46, + "learning_rate": 4.770076077768386e-05, + "loss": 0.3457, + "step": 544, + "task_loss": 0.4797295033931732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35948461294174194, + "epoch": 0.46, + "learning_rate": 4.769653423499578e-05, + "loss": 0.435, + "step": 545, + "task_loss": 0.8107247352600098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14028194546699524, + "epoch": 0.46, + "learning_rate": 4.76923076923077e-05, + "loss": 0.2966, + "step": 546, + "task_loss": 0.24427273869514465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2708081007003784, + "epoch": 0.46, + "learning_rate": 4.768808114961961e-05, + "loss": 0.3261, + "step": 547, + "task_loss": 0.3082984387874603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14703203737735748, + "epoch": 0.46, + "learning_rate": 4.768385460693153e-05, + "loss": 0.2915, + "step": 548, + "task_loss": 0.6045756340026855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2717135548591614, + "epoch": 0.46, + "learning_rate": 4.767962806424345e-05, + "loss": 0.3487, + "step": 549, + "task_loss": 1.257994532585144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2661055028438568, + "epoch": 0.46, + "learning_rate": 4.767540152155537e-05, + "loss": 0.3511, + "step": 550, + "task_loss": 1.0356454849243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20614999532699585, + "epoch": 0.47, + "learning_rate": 4.767117497886729e-05, + "loss": 0.2855, + "step": 551, + "task_loss": 0.8550474643707275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2185359001159668, + "epoch": 0.47, + "learning_rate": 4.766694843617921e-05, + "loss": 0.3445, + "step": 552, + "task_loss": 0.8826864957809448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19440582394599915, + "epoch": 0.47, + "learning_rate": 4.766272189349112e-05, + "loss": 0.3682, + "step": 553, + "task_loss": 1.3656309843063354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21757423877716064, + "epoch": 0.47, + "learning_rate": 4.765849535080304e-05, + "loss": 0.3358, + "step": 554, + "task_loss": 0.1205420270562172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3195817470550537, + "epoch": 0.47, + "learning_rate": 4.765426880811496e-05, + "loss": 0.4061, + "step": 555, + "task_loss": 0.4966970384120941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24277961254119873, + "epoch": 0.47, + "learning_rate": 4.765004226542688e-05, + "loss": 0.2809, + "step": 556, + "task_loss": 0.28136998414993286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35525980591773987, + "epoch": 0.47, + "learning_rate": 4.76458157227388e-05, + "loss": 0.4081, + "step": 557, + "task_loss": 1.4973515272140503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2915230989456177, + "epoch": 0.47, + "learning_rate": 4.764158918005072e-05, + "loss": 0.2908, + "step": 558, + "task_loss": 0.25230297446250916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4903471767902374, + "epoch": 0.47, + "learning_rate": 4.7637362637362635e-05, + "loss": 0.3884, + "step": 559, + "task_loss": 0.5842353105545044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21438221633434296, + "epoch": 0.47, + "learning_rate": 4.7633136094674555e-05, + "loss": 0.347, + "step": 560, + "task_loss": 0.7824996709823608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3050263524055481, + "epoch": 0.47, + "learning_rate": 4.762890955198648e-05, + "loss": 0.334, + "step": 561, + "task_loss": 0.6036117076873779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42564263939857483, + "epoch": 0.47, + "learning_rate": 4.76246830092984e-05, + "loss": 0.47, + "step": 562, + "task_loss": 0.9398266673088074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2418982982635498, + "epoch": 0.48, + "learning_rate": 4.7620456466610314e-05, + "loss": 0.3335, + "step": 563, + "task_loss": 0.7790709137916565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26156485080718994, + "epoch": 0.48, + "learning_rate": 4.7616229923922234e-05, + "loss": 0.3099, + "step": 564, + "task_loss": 1.1668341159820557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3860306739807129, + "epoch": 0.48, + "learning_rate": 4.761200338123415e-05, + "loss": 0.3577, + "step": 565, + "task_loss": 0.4409308135509491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1832307130098343, + "epoch": 0.48, + "learning_rate": 4.760777683854607e-05, + "loss": 0.2575, + "step": 566, + "task_loss": 0.09991677850484848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35417214035987854, + "epoch": 0.48, + "learning_rate": 4.760355029585799e-05, + "loss": 0.4463, + "step": 567, + "task_loss": 1.2209007740020752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22337156534194946, + "epoch": 0.48, + "learning_rate": 4.759932375316991e-05, + "loss": 0.4262, + "step": 568, + "task_loss": 0.05335812270641327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40127262473106384, + "epoch": 0.48, + "learning_rate": 4.7595097210481825e-05, + "loss": 0.4051, + "step": 569, + "task_loss": 0.9153675436973572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4061581790447235, + "epoch": 0.48, + "learning_rate": 4.7590870667793745e-05, + "loss": 0.3092, + "step": 570, + "task_loss": 0.33734214305877686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3070034384727478, + "epoch": 0.48, + "learning_rate": 4.7586644125105665e-05, + "loss": 0.4155, + "step": 571, + "task_loss": 0.6663563251495361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4043702185153961, + "epoch": 0.48, + "learning_rate": 4.7582417582417585e-05, + "loss": 0.2791, + "step": 572, + "task_loss": 0.3064482808113098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41948115825653076, + "epoch": 0.48, + "learning_rate": 4.7578191039729504e-05, + "loss": 0.3252, + "step": 573, + "task_loss": 0.902721643447876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19369187951087952, + "epoch": 0.48, + "learning_rate": 4.7573964497041424e-05, + "loss": 0.3187, + "step": 574, + "task_loss": 0.7382258176803589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2930937111377716, + "epoch": 0.49, + "learning_rate": 4.7569737954353344e-05, + "loss": 0.4316, + "step": 575, + "task_loss": 0.8327093124389648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4381175637245178, + "epoch": 0.49, + "learning_rate": 4.756551141166526e-05, + "loss": 0.4305, + "step": 576, + "task_loss": 0.4262275993824005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22403079271316528, + "epoch": 0.49, + "learning_rate": 4.7561284868977177e-05, + "loss": 0.3873, + "step": 577, + "task_loss": 1.1204639673233032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3349193334579468, + "epoch": 0.49, + "learning_rate": 4.75570583262891e-05, + "loss": 0.4167, + "step": 578, + "task_loss": 0.880795419216156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23430661857128143, + "epoch": 0.49, + "learning_rate": 4.7552831783601016e-05, + "loss": 0.4672, + "step": 579, + "task_loss": 1.0049666166305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2768678665161133, + "epoch": 0.49, + "learning_rate": 4.7548605240912936e-05, + "loss": 0.3531, + "step": 580, + "task_loss": 1.0239022970199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15766771137714386, + "epoch": 0.49, + "learning_rate": 4.7544378698224856e-05, + "loss": 0.281, + "step": 581, + "task_loss": 0.7404004335403442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3777739405632019, + "epoch": 0.49, + "learning_rate": 4.754015215553677e-05, + "loss": 0.4956, + "step": 582, + "task_loss": 1.2778314352035522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24781480431556702, + "epoch": 0.49, + "learning_rate": 4.7535925612848695e-05, + "loss": 0.3808, + "step": 583, + "task_loss": 0.21571092307567596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1052047461271286, + "epoch": 0.49, + "learning_rate": 4.7531699070160615e-05, + "loss": 0.325, + "step": 584, + "task_loss": 0.8777510523796082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44990894198417664, + "epoch": 0.49, + "learning_rate": 4.752747252747253e-05, + "loss": 0.3748, + "step": 585, + "task_loss": 1.3203763961791992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25098681449890137, + "epoch": 0.5, + "learning_rate": 4.752324598478445e-05, + "loss": 0.3174, + "step": 586, + "task_loss": 0.18381808698177338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36845919489860535, + "epoch": 0.5, + "learning_rate": 4.751901944209637e-05, + "loss": 0.3516, + "step": 587, + "task_loss": 0.7729542255401611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48265665769577026, + "epoch": 0.5, + "learning_rate": 4.751479289940829e-05, + "loss": 0.397, + "step": 588, + "task_loss": 0.7330112457275391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22566981613636017, + "epoch": 0.5, + "learning_rate": 4.751056635672021e-05, + "loss": 0.3897, + "step": 589, + "task_loss": 0.40791866183280945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38977566361427307, + "epoch": 0.5, + "learning_rate": 4.7506339814032126e-05, + "loss": 0.3694, + "step": 590, + "task_loss": 1.1916109323501587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22224251925945282, + "epoch": 0.5, + "learning_rate": 4.7502113271344046e-05, + "loss": 0.3209, + "step": 591, + "task_loss": 0.23281393945217133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22563908994197845, + "epoch": 0.5, + "learning_rate": 4.749788672865596e-05, + "loss": 0.324, + "step": 592, + "task_loss": 0.2593800723552704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24176111817359924, + "epoch": 0.5, + "learning_rate": 4.749366018596788e-05, + "loss": 0.2747, + "step": 593, + "task_loss": 0.4232793152332306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14211207628250122, + "epoch": 0.5, + "learning_rate": 4.74894336432798e-05, + "loss": 0.2476, + "step": 594, + "task_loss": 0.28077563643455505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3431089222431183, + "epoch": 0.5, + "learning_rate": 4.748520710059172e-05, + "loss": 0.3743, + "step": 595, + "task_loss": 0.33892443776130676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2833075523376465, + "epoch": 0.5, + "learning_rate": 4.748098055790364e-05, + "loss": 0.3038, + "step": 596, + "task_loss": 0.9755491018295288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18923494219779968, + "epoch": 0.5, + "learning_rate": 4.747675401521556e-05, + "loss": 0.3584, + "step": 597, + "task_loss": 0.2845441997051239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46768224239349365, + "epoch": 0.51, + "learning_rate": 4.747252747252747e-05, + "loss": 0.5477, + "step": 598, + "task_loss": 0.8213875889778137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25206178426742554, + "epoch": 0.51, + "learning_rate": 4.746830092983939e-05, + "loss": 0.3056, + "step": 599, + "task_loss": 0.5093951225280762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3375788927078247, + "epoch": 0.51, + "learning_rate": 4.746407438715132e-05, + "loss": 0.4649, + "step": 600, + "task_loss": 1.0816824436187744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20050367712974548, + "epoch": 0.51, + "learning_rate": 4.745984784446323e-05, + "loss": 0.4619, + "step": 601, + "task_loss": 1.3885235786437988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2921428680419922, + "epoch": 0.51, + "learning_rate": 4.745562130177515e-05, + "loss": 0.4117, + "step": 602, + "task_loss": 0.6163491606712341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3945145010948181, + "epoch": 0.51, + "learning_rate": 4.745139475908707e-05, + "loss": 0.2476, + "step": 603, + "task_loss": 0.5006596446037292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45314669609069824, + "epoch": 0.51, + "learning_rate": 4.744716821639899e-05, + "loss": 0.3132, + "step": 604, + "task_loss": 1.1119205951690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2548198103904724, + "epoch": 0.51, + "learning_rate": 4.744294167371091e-05, + "loss": 0.2579, + "step": 605, + "task_loss": 0.31535807251930237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24760308861732483, + "epoch": 0.51, + "learning_rate": 4.743871513102283e-05, + "loss": 0.3738, + "step": 606, + "task_loss": 0.14562563598155975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6453937888145447, + "epoch": 0.51, + "learning_rate": 4.743448858833475e-05, + "loss": 0.4029, + "step": 607, + "task_loss": 0.4509686827659607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45204368233680725, + "epoch": 0.51, + "learning_rate": 4.743026204564666e-05, + "loss": 0.3934, + "step": 608, + "task_loss": 0.6939801573753357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2942051291465759, + "epoch": 0.51, + "learning_rate": 4.742603550295858e-05, + "loss": 0.3947, + "step": 609, + "task_loss": 0.5363163352012634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31975531578063965, + "epoch": 0.52, + "learning_rate": 4.74218089602705e-05, + "loss": 0.3381, + "step": 610, + "task_loss": 0.5022314786911011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3351353406906128, + "epoch": 0.52, + "learning_rate": 4.741758241758242e-05, + "loss": 0.3142, + "step": 611, + "task_loss": 0.8475996851921082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26977065205574036, + "epoch": 0.52, + "learning_rate": 4.741335587489434e-05, + "loss": 0.3987, + "step": 612, + "task_loss": 0.6088987588882446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2809360921382904, + "epoch": 0.52, + "learning_rate": 4.740912933220626e-05, + "loss": 0.4348, + "step": 613, + "task_loss": 0.7034056782722473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2232653945684433, + "epoch": 0.52, + "learning_rate": 4.740490278951817e-05, + "loss": 0.3524, + "step": 614, + "task_loss": 1.1150834560394287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.340452641248703, + "epoch": 0.52, + "learning_rate": 4.740067624683009e-05, + "loss": 0.5105, + "step": 615, + "task_loss": 1.2780228853225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31792181730270386, + "epoch": 0.52, + "learning_rate": 4.739644970414201e-05, + "loss": 0.3358, + "step": 616, + "task_loss": 1.0097323656082153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29654550552368164, + "epoch": 0.52, + "learning_rate": 4.739222316145393e-05, + "loss": 0.3331, + "step": 617, + "task_loss": 1.2019599676132202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24710915982723236, + "epoch": 0.52, + "learning_rate": 4.738799661876585e-05, + "loss": 0.3947, + "step": 618, + "task_loss": 0.6719022989273071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34631967544555664, + "epoch": 0.52, + "learning_rate": 4.738377007607777e-05, + "loss": 0.3772, + "step": 619, + "task_loss": 0.2534567415714264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3382919132709503, + "epoch": 0.52, + "learning_rate": 4.737954353338969e-05, + "loss": 0.4319, + "step": 620, + "task_loss": 0.7812601923942566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4298924207687378, + "epoch": 0.52, + "learning_rate": 4.7375316990701604e-05, + "loss": 0.3914, + "step": 621, + "task_loss": 1.0895129442214966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7368341088294983, + "epoch": 0.53, + "learning_rate": 4.737109044801353e-05, + "loss": 0.4644, + "step": 622, + "task_loss": 1.4669815301895142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1839587688446045, + "epoch": 0.53, + "learning_rate": 4.736686390532545e-05, + "loss": 0.2894, + "step": 623, + "task_loss": 0.10642831772565842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.549903929233551, + "epoch": 0.53, + "learning_rate": 4.7362637362637364e-05, + "loss": 0.4813, + "step": 624, + "task_loss": 0.5704725384712219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.390815407037735, + "epoch": 0.53, + "learning_rate": 4.735841081994928e-05, + "loss": 0.4957, + "step": 625, + "task_loss": 0.3729742765426636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17100507020950317, + "epoch": 0.53, + "learning_rate": 4.73541842772612e-05, + "loss": 0.3505, + "step": 626, + "task_loss": 0.6985848546028137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40018224716186523, + "epoch": 0.53, + "learning_rate": 4.7349957734573116e-05, + "loss": 0.3914, + "step": 627, + "task_loss": 0.48632675409317017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5129507184028625, + "epoch": 0.53, + "learning_rate": 4.734573119188504e-05, + "loss": 0.4269, + "step": 628, + "task_loss": 0.9075433611869812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27295151352882385, + "epoch": 0.53, + "learning_rate": 4.734150464919696e-05, + "loss": 0.2895, + "step": 629, + "task_loss": 0.8488480448722839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4017839729785919, + "epoch": 0.53, + "learning_rate": 4.7337278106508875e-05, + "loss": 0.3767, + "step": 630, + "task_loss": 1.2542822360992432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36919310688972473, + "epoch": 0.53, + "learning_rate": 4.7333051563820795e-05, + "loss": 0.3212, + "step": 631, + "task_loss": 0.3986697494983673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34609749913215637, + "epoch": 0.53, + "learning_rate": 4.7328825021132715e-05, + "loss": 0.4112, + "step": 632, + "task_loss": 0.9559993743896484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4174095392227173, + "epoch": 0.53, + "learning_rate": 4.7324598478444634e-05, + "loss": 0.3243, + "step": 633, + "task_loss": 0.25843098759651184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24086512625217438, + "epoch": 0.54, + "learning_rate": 4.7320371935756554e-05, + "loss": 0.2703, + "step": 634, + "task_loss": 0.8146135210990906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2659895718097687, + "epoch": 0.54, + "learning_rate": 4.7316145393068474e-05, + "loss": 0.3339, + "step": 635, + "task_loss": 0.4702014625072479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33221834897994995, + "epoch": 0.54, + "learning_rate": 4.7311918850380394e-05, + "loss": 0.3936, + "step": 636, + "task_loss": 1.457092046737671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2918318510055542, + "epoch": 0.54, + "learning_rate": 4.730769230769231e-05, + "loss": 0.3259, + "step": 637, + "task_loss": 0.7226035594940186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33243972063064575, + "epoch": 0.54, + "learning_rate": 4.7303465765004226e-05, + "loss": 0.3405, + "step": 638, + "task_loss": 1.0920684337615967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.391266793012619, + "epoch": 0.54, + "learning_rate": 4.729923922231615e-05, + "loss": 0.4497, + "step": 639, + "task_loss": 0.24512942135334015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6080100536346436, + "epoch": 0.54, + "learning_rate": 4.7295012679628066e-05, + "loss": 0.3821, + "step": 640, + "task_loss": 0.40043750405311584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19417530298233032, + "epoch": 0.54, + "learning_rate": 4.7290786136939986e-05, + "loss": 0.3383, + "step": 641, + "task_loss": 0.259618878364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23771923780441284, + "epoch": 0.54, + "learning_rate": 4.7286559594251905e-05, + "loss": 0.3528, + "step": 642, + "task_loss": 0.9821199774742126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26665687561035156, + "epoch": 0.54, + "learning_rate": 4.728233305156382e-05, + "loss": 0.4123, + "step": 643, + "task_loss": 0.9828779697418213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2349528670310974, + "epoch": 0.54, + "learning_rate": 4.727810650887574e-05, + "loss": 0.3125, + "step": 644, + "task_loss": 0.5495439171791077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37252551317214966, + "epoch": 0.54, + "learning_rate": 4.7273879966187665e-05, + "loss": 0.423, + "step": 645, + "task_loss": 0.6340850591659546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3871510922908783, + "epoch": 0.55, + "learning_rate": 4.726965342349958e-05, + "loss": 0.4527, + "step": 646, + "task_loss": 0.31135350465774536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40592512488365173, + "epoch": 0.55, + "learning_rate": 4.72654268808115e-05, + "loss": 0.365, + "step": 647, + "task_loss": 0.35286909341812134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19760552048683167, + "epoch": 0.55, + "learning_rate": 4.726120033812342e-05, + "loss": 0.4602, + "step": 648, + "task_loss": 0.2875680923461914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2774466872215271, + "epoch": 0.55, + "learning_rate": 4.725697379543534e-05, + "loss": 0.3009, + "step": 649, + "task_loss": 0.40192073583602905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.386837363243103, + "epoch": 0.55, + "learning_rate": 4.7252747252747257e-05, + "loss": 0.3292, + "step": 650, + "task_loss": 0.8110374212265015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2615984380245209, + "epoch": 0.55, + "learning_rate": 4.7248520710059176e-05, + "loss": 0.342, + "step": 651, + "task_loss": 0.7546160221099854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26061612367630005, + "epoch": 0.55, + "learning_rate": 4.7244294167371096e-05, + "loss": 0.292, + "step": 652, + "task_loss": 0.5437497496604919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27350395917892456, + "epoch": 0.55, + "learning_rate": 4.724006762468301e-05, + "loss": 0.3561, + "step": 653, + "task_loss": 0.23291213810443878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.317546546459198, + "epoch": 0.55, + "learning_rate": 4.723584108199493e-05, + "loss": 0.3753, + "step": 654, + "task_loss": 0.5752608776092529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6529425382614136, + "epoch": 0.55, + "learning_rate": 4.723161453930685e-05, + "loss": 0.3672, + "step": 655, + "task_loss": 0.07581650465726852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29290971159935, + "epoch": 0.55, + "learning_rate": 4.722738799661877e-05, + "loss": 0.449, + "step": 656, + "task_loss": 0.593906819820404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22481581568717957, + "epoch": 0.56, + "learning_rate": 4.722316145393069e-05, + "loss": 0.3504, + "step": 657, + "task_loss": 0.557009220123291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28708481788635254, + "epoch": 0.56, + "learning_rate": 4.721893491124261e-05, + "loss": 0.3644, + "step": 658, + "task_loss": 0.6461619734764099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3660014271736145, + "epoch": 0.56, + "learning_rate": 4.721470836855452e-05, + "loss": 0.5447, + "step": 659, + "task_loss": 0.8711457252502441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1999700367450714, + "epoch": 0.56, + "learning_rate": 4.721048182586644e-05, + "loss": 0.4082, + "step": 660, + "task_loss": 0.39707931876182556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6267917156219482, + "epoch": 0.56, + "learning_rate": 4.720625528317836e-05, + "loss": 0.5217, + "step": 661, + "task_loss": 0.8283354043960571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40910154581069946, + "epoch": 0.56, + "learning_rate": 4.7202028740490287e-05, + "loss": 0.4229, + "step": 662, + "task_loss": 0.5927056670188904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2516242563724518, + "epoch": 0.56, + "learning_rate": 4.71978021978022e-05, + "loss": 0.3609, + "step": 663, + "task_loss": 0.14902153611183167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.333596408367157, + "epoch": 0.56, + "learning_rate": 4.719357565511412e-05, + "loss": 0.5049, + "step": 664, + "task_loss": 0.3535808324813843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3168475031852722, + "epoch": 0.56, + "learning_rate": 4.718934911242604e-05, + "loss": 0.3079, + "step": 665, + "task_loss": 0.6552111506462097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33432474732398987, + "epoch": 0.56, + "learning_rate": 4.718512256973795e-05, + "loss": 0.2977, + "step": 666, + "task_loss": 0.39099422097206116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39753422141075134, + "epoch": 0.56, + "learning_rate": 4.718089602704988e-05, + "loss": 0.3301, + "step": 667, + "task_loss": 0.47903263568878174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3049262762069702, + "epoch": 0.56, + "learning_rate": 4.71766694843618e-05, + "loss": 0.4022, + "step": 668, + "task_loss": 0.6022108793258667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35491737723350525, + "epoch": 0.57, + "learning_rate": 4.717244294167371e-05, + "loss": 0.3691, + "step": 669, + "task_loss": 0.7374735474586487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2542036771774292, + "epoch": 0.57, + "learning_rate": 4.716821639898563e-05, + "loss": 0.2639, + "step": 670, + "task_loss": 0.33632931113243103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45093709230422974, + "epoch": 0.57, + "learning_rate": 4.716398985629755e-05, + "loss": 0.3916, + "step": 671, + "task_loss": 1.0387479066848755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47321417927742004, + "epoch": 0.57, + "learning_rate": 4.715976331360947e-05, + "loss": 0.411, + "step": 672, + "task_loss": 0.6558405756950378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37407171726226807, + "epoch": 0.57, + "learning_rate": 4.715553677092139e-05, + "loss": 0.3168, + "step": 673, + "task_loss": 1.0233266353607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20857971906661987, + "epoch": 0.57, + "learning_rate": 4.715131022823331e-05, + "loss": 0.3295, + "step": 674, + "task_loss": 0.37332332134246826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5774608254432678, + "epoch": 0.57, + "learning_rate": 4.714708368554522e-05, + "loss": 0.525, + "step": 675, + "task_loss": 1.3238980770111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22712036967277527, + "epoch": 0.57, + "learning_rate": 4.714285714285714e-05, + "loss": 0.4051, + "step": 676, + "task_loss": 0.6897934079170227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17959004640579224, + "epoch": 0.57, + "learning_rate": 4.713863060016906e-05, + "loss": 0.3194, + "step": 677, + "task_loss": 0.3948770761489868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2370845079421997, + "epoch": 0.57, + "learning_rate": 4.713440405748098e-05, + "loss": 0.3015, + "step": 678, + "task_loss": 0.1519782990217209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2547301948070526, + "epoch": 0.57, + "learning_rate": 4.71301775147929e-05, + "loss": 0.3392, + "step": 679, + "task_loss": 0.42188867926597595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3425919711589813, + "epoch": 0.57, + "learning_rate": 4.712595097210482e-05, + "loss": 0.2653, + "step": 680, + "task_loss": 0.8486734628677368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28201133012771606, + "epoch": 0.58, + "learning_rate": 4.712172442941674e-05, + "loss": 0.3232, + "step": 681, + "task_loss": 0.21899619698524475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1665186733007431, + "epoch": 0.58, + "learning_rate": 4.7117497886728654e-05, + "loss": 0.3467, + "step": 682, + "task_loss": 0.3213064968585968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.553528904914856, + "epoch": 0.58, + "learning_rate": 4.7113271344040574e-05, + "loss": 0.396, + "step": 683, + "task_loss": 0.5959455966949463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20433126389980316, + "epoch": 0.58, + "learning_rate": 4.71090448013525e-05, + "loss": 0.274, + "step": 684, + "task_loss": 0.6436899304389954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20822347700595856, + "epoch": 0.58, + "learning_rate": 4.7104818258664413e-05, + "loss": 0.3299, + "step": 685, + "task_loss": 0.4979240596294403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46341797709465027, + "epoch": 0.58, + "learning_rate": 4.710059171597633e-05, + "loss": 0.3165, + "step": 686, + "task_loss": 0.1802392452955246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34004417061805725, + "epoch": 0.58, + "learning_rate": 4.709636517328825e-05, + "loss": 0.3236, + "step": 687, + "task_loss": 0.9467293620109558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1860482394695282, + "epoch": 0.58, + "learning_rate": 4.7092138630600166e-05, + "loss": 0.2697, + "step": 688, + "task_loss": 0.7974063754081726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4567367732524872, + "epoch": 0.58, + "learning_rate": 4.708791208791209e-05, + "loss": 0.3866, + "step": 689, + "task_loss": 0.4629192650318146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1610099971294403, + "epoch": 0.58, + "learning_rate": 4.708368554522401e-05, + "loss": 0.2618, + "step": 690, + "task_loss": 0.3629488945007324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.322256475687027, + "epoch": 0.58, + "learning_rate": 4.707945900253593e-05, + "loss": 0.3583, + "step": 691, + "task_loss": 0.43364420533180237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5227921009063721, + "epoch": 0.58, + "learning_rate": 4.7075232459847845e-05, + "loss": 0.4195, + "step": 692, + "task_loss": 1.0156371593475342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3206583857536316, + "epoch": 0.59, + "learning_rate": 4.7071005917159765e-05, + "loss": 0.4679, + "step": 693, + "task_loss": 1.4880452156066895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6341646313667297, + "epoch": 0.59, + "learning_rate": 4.7066779374471684e-05, + "loss": 0.3936, + "step": 694, + "task_loss": 0.5662680864334106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3810037076473236, + "epoch": 0.59, + "learning_rate": 4.7062552831783604e-05, + "loss": 0.3505, + "step": 695, + "task_loss": 1.1134657859802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2382390946149826, + "epoch": 0.59, + "learning_rate": 4.7058326289095524e-05, + "loss": 0.3228, + "step": 696, + "task_loss": 0.4972875118255615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5607861280441284, + "epoch": 0.59, + "learning_rate": 4.7054099746407444e-05, + "loss": 0.4341, + "step": 697, + "task_loss": 0.9519200325012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49214595556259155, + "epoch": 0.59, + "learning_rate": 4.7049873203719357e-05, + "loss": 0.4335, + "step": 698, + "task_loss": 0.3133102357387543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3421843647956848, + "epoch": 0.59, + "learning_rate": 4.7045646661031276e-05, + "loss": 0.434, + "step": 699, + "task_loss": 0.6254091858863831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2969176471233368, + "epoch": 0.59, + "learning_rate": 4.7041420118343196e-05, + "loss": 0.4202, + "step": 700, + "task_loss": 0.47923919558525085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2680771052837372, + "epoch": 0.59, + "learning_rate": 4.7037193575655116e-05, + "loss": 0.4648, + "step": 701, + "task_loss": 0.5146772861480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26482564210891724, + "epoch": 0.59, + "learning_rate": 4.7032967032967035e-05, + "loss": 0.3071, + "step": 702, + "task_loss": 0.809281051158905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36773034930229187, + "epoch": 0.59, + "learning_rate": 4.7028740490278955e-05, + "loss": 0.3868, + "step": 703, + "task_loss": 0.6665106415748596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18537673354148865, + "epoch": 0.59, + "learning_rate": 4.702451394759087e-05, + "loss": 0.3274, + "step": 704, + "task_loss": 1.5008612871170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45086485147476196, + "epoch": 0.6, + "learning_rate": 4.702028740490279e-05, + "loss": 0.369, + "step": 705, + "task_loss": 1.3042993545532227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19615305960178375, + "epoch": 0.6, + "learning_rate": 4.7016060862214714e-05, + "loss": 0.3271, + "step": 706, + "task_loss": 0.33956247568130493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.236154705286026, + "epoch": 0.6, + "learning_rate": 4.7011834319526634e-05, + "loss": 0.3975, + "step": 707, + "task_loss": 0.5376070141792297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25039881467819214, + "epoch": 0.6, + "learning_rate": 4.700760777683855e-05, + "loss": 0.3244, + "step": 708, + "task_loss": 0.390200674533844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1740414947271347, + "epoch": 0.6, + "learning_rate": 4.700338123415047e-05, + "loss": 0.5561, + "step": 709, + "task_loss": 0.3755144774913788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24118320643901825, + "epoch": 0.6, + "learning_rate": 4.6999154691462387e-05, + "loss": 0.3875, + "step": 710, + "task_loss": 0.849571943283081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2488633245229721, + "epoch": 0.6, + "learning_rate": 4.6994928148774306e-05, + "loss": 0.3628, + "step": 711, + "task_loss": 1.3228331804275513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40403005480766296, + "epoch": 0.6, + "learning_rate": 4.6990701606086226e-05, + "loss": 0.4502, + "step": 712, + "task_loss": 0.7019414305686951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23417091369628906, + "epoch": 0.6, + "learning_rate": 4.6986475063398146e-05, + "loss": 0.3034, + "step": 713, + "task_loss": 0.18903414905071259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35916635394096375, + "epoch": 0.6, + "learning_rate": 4.698224852071006e-05, + "loss": 0.405, + "step": 714, + "task_loss": 0.11952552944421768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18628966808319092, + "epoch": 0.6, + "learning_rate": 4.697802197802198e-05, + "loss": 0.2843, + "step": 715, + "task_loss": 0.5236517786979675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29364699125289917, + "epoch": 0.6, + "learning_rate": 4.69737954353339e-05, + "loss": 0.3668, + "step": 716, + "task_loss": 0.9426444172859192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2838394045829773, + "epoch": 0.61, + "learning_rate": 4.696956889264582e-05, + "loss": 0.4446, + "step": 717, + "task_loss": 0.7164893746376038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2804027199745178, + "epoch": 0.61, + "learning_rate": 4.696534234995774e-05, + "loss": 0.3231, + "step": 718, + "task_loss": 0.24467986822128296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2521743178367615, + "epoch": 0.61, + "learning_rate": 4.696111580726966e-05, + "loss": 0.3527, + "step": 719, + "task_loss": 1.05292809009552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46222758293151855, + "epoch": 0.61, + "learning_rate": 4.695688926458158e-05, + "loss": 0.5009, + "step": 720, + "task_loss": 0.960830569267273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46111834049224854, + "epoch": 0.61, + "learning_rate": 4.695266272189349e-05, + "loss": 0.3516, + "step": 721, + "task_loss": 0.8451549410820007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3342827558517456, + "epoch": 0.61, + "learning_rate": 4.694843617920541e-05, + "loss": 0.3175, + "step": 722, + "task_loss": 0.5725528597831726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20860952138900757, + "epoch": 0.61, + "learning_rate": 4.6944209636517336e-05, + "loss": 0.3185, + "step": 723, + "task_loss": 0.2770271897315979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31454330682754517, + "epoch": 0.61, + "learning_rate": 4.693998309382925e-05, + "loss": 0.4121, + "step": 724, + "task_loss": 0.6683022975921631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5281179547309875, + "epoch": 0.61, + "learning_rate": 4.693575655114117e-05, + "loss": 0.3287, + "step": 725, + "task_loss": 1.185948133468628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41321808099746704, + "epoch": 0.61, + "learning_rate": 4.693153000845309e-05, + "loss": 0.4766, + "step": 726, + "task_loss": 0.46217796206474304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29537543654441833, + "epoch": 0.61, + "learning_rate": 4.6927303465765e-05, + "loss": 0.3582, + "step": 727, + "task_loss": 0.5150099396705627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22204221785068512, + "epoch": 0.61, + "learning_rate": 4.692307692307693e-05, + "loss": 0.364, + "step": 728, + "task_loss": 0.47867512702941895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22438563406467438, + "epoch": 0.62, + "learning_rate": 4.691885038038885e-05, + "loss": 0.3536, + "step": 729, + "task_loss": 0.4347574710845947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16139662265777588, + "epoch": 0.62, + "learning_rate": 4.691462383770076e-05, + "loss": 0.3303, + "step": 730, + "task_loss": 0.3739808201789856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22806861996650696, + "epoch": 0.62, + "learning_rate": 4.691039729501268e-05, + "loss": 0.369, + "step": 731, + "task_loss": 0.6877012252807617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3352641463279724, + "epoch": 0.62, + "learning_rate": 4.69061707523246e-05, + "loss": 0.3955, + "step": 732, + "task_loss": 1.2132909297943115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3059541583061218, + "epoch": 0.62, + "learning_rate": 4.690194420963652e-05, + "loss": 0.4438, + "step": 733, + "task_loss": 1.191922903060913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43167826533317566, + "epoch": 0.62, + "learning_rate": 4.689771766694844e-05, + "loss": 0.3755, + "step": 734, + "task_loss": 0.6345545053482056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.354397177696228, + "epoch": 0.62, + "learning_rate": 4.689349112426036e-05, + "loss": 0.4559, + "step": 735, + "task_loss": 0.4371297359466553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1969318389892578, + "epoch": 0.62, + "learning_rate": 4.688926458157228e-05, + "loss": 0.2935, + "step": 736, + "task_loss": 0.4778875708580017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3992832899093628, + "epoch": 0.62, + "learning_rate": 4.688503803888419e-05, + "loss": 0.2957, + "step": 737, + "task_loss": 0.18934839963912964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1824398785829544, + "epoch": 0.62, + "learning_rate": 4.688081149619611e-05, + "loss": 0.3074, + "step": 738, + "task_loss": 0.06955501437187195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2365836352109909, + "epoch": 0.62, + "learning_rate": 4.687658495350803e-05, + "loss": 0.3694, + "step": 739, + "task_loss": 0.6758744120597839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23081645369529724, + "epoch": 0.63, + "learning_rate": 4.687235841081995e-05, + "loss": 0.2714, + "step": 740, + "task_loss": 0.8412255048751831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3754105567932129, + "epoch": 0.63, + "learning_rate": 4.686813186813187e-05, + "loss": 0.3583, + "step": 741, + "task_loss": 0.039537061005830765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48759883642196655, + "epoch": 0.63, + "learning_rate": 4.686390532544379e-05, + "loss": 0.403, + "step": 742, + "task_loss": 0.6773413419723511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38453003764152527, + "epoch": 0.63, + "learning_rate": 4.6859678782755704e-05, + "loss": 0.4022, + "step": 743, + "task_loss": 1.2605183124542236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3999258875846863, + "epoch": 0.63, + "learning_rate": 4.6855452240067624e-05, + "loss": 0.3801, + "step": 744, + "task_loss": 1.3958191871643066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46052882075309753, + "epoch": 0.63, + "learning_rate": 4.685122569737955e-05, + "loss": 0.3765, + "step": 745, + "task_loss": 0.43500369787216187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19625413417816162, + "epoch": 0.63, + "learning_rate": 4.684699915469146e-05, + "loss": 0.3736, + "step": 746, + "task_loss": 0.803209125995636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26730993390083313, + "epoch": 0.63, + "learning_rate": 4.684277261200338e-05, + "loss": 0.3664, + "step": 747, + "task_loss": 0.3881904184818268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5900534391403198, + "epoch": 0.63, + "learning_rate": 4.68385460693153e-05, + "loss": 0.4657, + "step": 748, + "task_loss": 0.9224446415901184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4307066798210144, + "epoch": 0.63, + "learning_rate": 4.683431952662722e-05, + "loss": 0.3378, + "step": 749, + "task_loss": 0.3119797110557556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3169203996658325, + "epoch": 0.63, + "learning_rate": 4.683009298393914e-05, + "loss": 0.4756, + "step": 750, + "task_loss": 0.44903531670570374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4722621440887451, + "epoch": 0.63, + "learning_rate": 4.682586644125106e-05, + "loss": 0.3441, + "step": 751, + "task_loss": 0.7778240442276001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21134699881076813, + "epoch": 0.64, + "learning_rate": 4.682163989856298e-05, + "loss": 0.3761, + "step": 752, + "task_loss": 0.2646298408508301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3201712965965271, + "epoch": 0.64, + "learning_rate": 4.6817413355874895e-05, + "loss": 0.3234, + "step": 753, + "task_loss": 1.1199707984924316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2911157011985779, + "epoch": 0.64, + "learning_rate": 4.6813186813186814e-05, + "loss": 0.3709, + "step": 754, + "task_loss": 0.6298675537109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2521316409111023, + "epoch": 0.64, + "learning_rate": 4.6808960270498734e-05, + "loss": 0.273, + "step": 755, + "task_loss": 0.5158833861351013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11915340274572372, + "epoch": 0.64, + "learning_rate": 4.6804733727810654e-05, + "loss": 0.2958, + "step": 756, + "task_loss": 0.5755556225776672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2479076087474823, + "epoch": 0.64, + "learning_rate": 4.6800507185122574e-05, + "loss": 0.3772, + "step": 757, + "task_loss": 0.3620893061161041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3179338276386261, + "epoch": 0.64, + "learning_rate": 4.6796280642434493e-05, + "loss": 0.337, + "step": 758, + "task_loss": 0.24180671572685242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13410022854804993, + "epoch": 0.64, + "learning_rate": 4.6792054099746406e-05, + "loss": 0.2723, + "step": 759, + "task_loss": 0.0071316249668598175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33512306213378906, + "epoch": 0.64, + "learning_rate": 4.6787827557058326e-05, + "loss": 0.3455, + "step": 760, + "task_loss": 0.4476659297943115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1407025307416916, + "epoch": 0.64, + "learning_rate": 4.6783601014370246e-05, + "loss": 0.3167, + "step": 761, + "task_loss": 0.5032293200492859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34007760882377625, + "epoch": 0.64, + "learning_rate": 4.6779374471682166e-05, + "loss": 0.2459, + "step": 762, + "task_loss": 1.0502866506576538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34080156683921814, + "epoch": 0.64, + "learning_rate": 4.6775147928994085e-05, + "loss": 0.4203, + "step": 763, + "task_loss": 0.1957993507385254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40594416856765747, + "epoch": 0.65, + "learning_rate": 4.6770921386306005e-05, + "loss": 0.3926, + "step": 764, + "task_loss": 0.4052309989929199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32644131779670715, + "epoch": 0.65, + "learning_rate": 4.6766694843617925e-05, + "loss": 0.329, + "step": 765, + "task_loss": 0.18310363590717316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22191733121871948, + "epoch": 0.65, + "learning_rate": 4.676246830092984e-05, + "loss": 0.3957, + "step": 766, + "task_loss": 0.4479212462902069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3019043803215027, + "epoch": 0.65, + "learning_rate": 4.6758241758241764e-05, + "loss": 0.2918, + "step": 767, + "task_loss": 0.9111586809158325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24184313416481018, + "epoch": 0.65, + "learning_rate": 4.6754015215553684e-05, + "loss": 0.3361, + "step": 768, + "task_loss": 0.36255335807800293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25905492901802063, + "epoch": 0.65, + "learning_rate": 4.67497886728656e-05, + "loss": 0.3851, + "step": 769, + "task_loss": 1.5149391889572144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32056835293769836, + "epoch": 0.65, + "learning_rate": 4.674556213017752e-05, + "loss": 0.426, + "step": 770, + "task_loss": 0.435532808303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28691959381103516, + "epoch": 0.65, + "learning_rate": 4.6741335587489436e-05, + "loss": 0.3853, + "step": 771, + "task_loss": 0.24572913348674774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.362898588180542, + "epoch": 0.65, + "learning_rate": 4.673710904480135e-05, + "loss": 0.3312, + "step": 772, + "task_loss": 0.875820517539978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26760318875312805, + "epoch": 0.65, + "learning_rate": 4.6732882502113276e-05, + "loss": 0.372, + "step": 773, + "task_loss": 0.4021333158016205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2651258111000061, + "epoch": 0.65, + "learning_rate": 4.6728655959425196e-05, + "loss": 0.2993, + "step": 774, + "task_loss": 1.2189784049987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2317265421152115, + "epoch": 0.65, + "learning_rate": 4.672442941673711e-05, + "loss": 0.289, + "step": 775, + "task_loss": 0.8031799793243408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39754974842071533, + "epoch": 0.66, + "learning_rate": 4.672020287404903e-05, + "loss": 0.3329, + "step": 776, + "task_loss": 0.6385886669158936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14862455427646637, + "epoch": 0.66, + "learning_rate": 4.671597633136095e-05, + "loss": 0.2246, + "step": 777, + "task_loss": 0.12774179875850677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4029311537742615, + "epoch": 0.66, + "learning_rate": 4.671174978867287e-05, + "loss": 0.3308, + "step": 778, + "task_loss": 0.7680000066757202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31030893325805664, + "epoch": 0.66, + "learning_rate": 4.670752324598479e-05, + "loss": 0.3138, + "step": 779, + "task_loss": 0.5516699552536011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28797948360443115, + "epoch": 0.66, + "learning_rate": 4.670329670329671e-05, + "loss": 0.2463, + "step": 780, + "task_loss": 0.27283746004104614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22052887082099915, + "epoch": 0.66, + "learning_rate": 4.669907016060863e-05, + "loss": 0.3427, + "step": 781, + "task_loss": 0.666221022605896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48894840478897095, + "epoch": 0.66, + "learning_rate": 4.669484361792054e-05, + "loss": 0.3651, + "step": 782, + "task_loss": 1.259474515914917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3072489798069, + "epoch": 0.66, + "learning_rate": 4.669061707523246e-05, + "loss": 0.2968, + "step": 783, + "task_loss": 0.22879329323768616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35828760266304016, + "epoch": 0.66, + "learning_rate": 4.668639053254438e-05, + "loss": 0.4091, + "step": 784, + "task_loss": 0.590131402015686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2617449164390564, + "epoch": 0.66, + "learning_rate": 4.66821639898563e-05, + "loss": 0.3515, + "step": 785, + "task_loss": 0.5642703175544739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19134745001792908, + "epoch": 0.66, + "learning_rate": 4.667793744716822e-05, + "loss": 0.405, + "step": 786, + "task_loss": 0.09782329946756363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29645299911499023, + "epoch": 0.66, + "learning_rate": 4.667371090448014e-05, + "loss": 0.3337, + "step": 787, + "task_loss": 0.527434229850769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2847943902015686, + "epoch": 0.67, + "learning_rate": 4.666948436179205e-05, + "loss": 0.2979, + "step": 788, + "task_loss": 0.2787574231624603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1994607150554657, + "epoch": 0.67, + "learning_rate": 4.666525781910397e-05, + "loss": 0.277, + "step": 789, + "task_loss": 0.43980374932289124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18303309381008148, + "epoch": 0.67, + "learning_rate": 4.66610312764159e-05, + "loss": 0.2804, + "step": 790, + "task_loss": 0.9812135100364685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5979506969451904, + "epoch": 0.67, + "learning_rate": 4.665680473372781e-05, + "loss": 0.4914, + "step": 791, + "task_loss": 0.4102175533771515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2620372176170349, + "epoch": 0.67, + "learning_rate": 4.665257819103973e-05, + "loss": 0.3372, + "step": 792, + "task_loss": 0.9842884540557861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4323297142982483, + "epoch": 0.67, + "learning_rate": 4.664835164835165e-05, + "loss": 0.3887, + "step": 793, + "task_loss": 0.7159919142723083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3561616539955139, + "epoch": 0.67, + "learning_rate": 4.664412510566357e-05, + "loss": 0.3872, + "step": 794, + "task_loss": 0.49123963713645935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28778600692749023, + "epoch": 0.67, + "learning_rate": 4.663989856297549e-05, + "loss": 0.4273, + "step": 795, + "task_loss": 0.2565106451511383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14981184899806976, + "epoch": 0.67, + "learning_rate": 4.663567202028741e-05, + "loss": 0.2308, + "step": 796, + "task_loss": 0.424303263425827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24332532286643982, + "epoch": 0.67, + "learning_rate": 4.663144547759933e-05, + "loss": 0.317, + "step": 797, + "task_loss": 0.6541764140129089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4180551767349243, + "epoch": 0.67, + "learning_rate": 4.662721893491124e-05, + "loss": 0.3242, + "step": 798, + "task_loss": 1.287262201309204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2564396858215332, + "epoch": 0.67, + "learning_rate": 4.662299239222316e-05, + "loss": 0.4082, + "step": 799, + "task_loss": 0.5781082510948181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15584638714790344, + "epoch": 0.68, + "learning_rate": 4.661876584953508e-05, + "loss": 0.2786, + "step": 800, + "task_loss": 0.6881094574928284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23558972775936127, + "epoch": 0.68, + "learning_rate": 4.6614539306847e-05, + "loss": 0.2289, + "step": 801, + "task_loss": 0.1461523473262787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.304515540599823, + "epoch": 0.68, + "learning_rate": 4.661031276415892e-05, + "loss": 0.4035, + "step": 802, + "task_loss": 0.5185538530349731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2558799386024475, + "epoch": 0.68, + "learning_rate": 4.660608622147084e-05, + "loss": 0.4876, + "step": 803, + "task_loss": 0.6770008206367493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2675953209400177, + "epoch": 0.68, + "learning_rate": 4.6601859678782754e-05, + "loss": 0.2623, + "step": 804, + "task_loss": 1.2368402481079102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3183937072753906, + "epoch": 0.68, + "learning_rate": 4.6597633136094674e-05, + "loss": 0.3384, + "step": 805, + "task_loss": 0.8166303634643555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25635120272636414, + "epoch": 0.68, + "learning_rate": 4.6593406593406593e-05, + "loss": 0.3408, + "step": 806, + "task_loss": 0.3825168311595917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2623569369316101, + "epoch": 0.68, + "learning_rate": 4.658918005071852e-05, + "loss": 0.2882, + "step": 807, + "task_loss": 0.23788872361183167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4122350811958313, + "epoch": 0.68, + "learning_rate": 4.658495350803043e-05, + "loss": 0.3606, + "step": 808, + "task_loss": 0.8126048445701599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48980242013931274, + "epoch": 0.68, + "learning_rate": 4.658072696534235e-05, + "loss": 0.2975, + "step": 809, + "task_loss": 1.1203809976577759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49316003918647766, + "epoch": 0.68, + "learning_rate": 4.657650042265427e-05, + "loss": 0.3358, + "step": 810, + "task_loss": 0.8615176677703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1466982662677765, + "epoch": 0.69, + "learning_rate": 4.6572273879966185e-05, + "loss": 0.331, + "step": 811, + "task_loss": 0.013799971900880337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18643391132354736, + "epoch": 0.69, + "learning_rate": 4.656804733727811e-05, + "loss": 0.3577, + "step": 812, + "task_loss": 0.11287066340446472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2176889330148697, + "epoch": 0.69, + "learning_rate": 4.656382079459003e-05, + "loss": 0.2355, + "step": 813, + "task_loss": 0.2188456952571869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30954548716545105, + "epoch": 0.69, + "learning_rate": 4.6559594251901945e-05, + "loss": 0.3371, + "step": 814, + "task_loss": 0.6223635673522949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17586016654968262, + "epoch": 0.69, + "learning_rate": 4.6555367709213864e-05, + "loss": 0.4006, + "step": 815, + "task_loss": 0.047391582280397415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2198064625263214, + "epoch": 0.69, + "learning_rate": 4.6551141166525784e-05, + "loss": 0.3579, + "step": 816, + "task_loss": 0.18386153876781464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19883403182029724, + "epoch": 0.69, + "learning_rate": 4.6546914623837704e-05, + "loss": 0.2644, + "step": 817, + "task_loss": 0.8972535133361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3488885760307312, + "epoch": 0.69, + "learning_rate": 4.6542688081149624e-05, + "loss": 0.3515, + "step": 818, + "task_loss": 0.8315311670303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34419578313827515, + "epoch": 0.69, + "learning_rate": 4.653846153846154e-05, + "loss": 0.2717, + "step": 819, + "task_loss": 0.7635257840156555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3127756118774414, + "epoch": 0.69, + "learning_rate": 4.6534234995773456e-05, + "loss": 0.327, + "step": 820, + "task_loss": 0.28192371129989624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17077802121639252, + "epoch": 0.69, + "learning_rate": 4.6530008453085376e-05, + "loss": 0.2726, + "step": 821, + "task_loss": 0.5848979353904724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3321095108985901, + "epoch": 0.69, + "learning_rate": 4.6525781910397296e-05, + "loss": 0.3545, + "step": 822, + "task_loss": 1.1666940450668335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17902451753616333, + "epoch": 0.7, + "learning_rate": 4.6521555367709215e-05, + "loss": 0.3196, + "step": 823, + "task_loss": 0.2585326135158539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2681087255477905, + "epoch": 0.7, + "learning_rate": 4.6517328825021135e-05, + "loss": 0.3005, + "step": 824, + "task_loss": 0.5993656516075134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17954730987548828, + "epoch": 0.7, + "learning_rate": 4.6513102282333055e-05, + "loss": 0.2502, + "step": 825, + "task_loss": 0.31453436613082886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2539949417114258, + "epoch": 0.7, + "learning_rate": 4.6508875739644975e-05, + "loss": 0.2447, + "step": 826, + "task_loss": 0.5321019291877747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4174681603908539, + "epoch": 0.7, + "learning_rate": 4.650464919695689e-05, + "loss": 0.3154, + "step": 827, + "task_loss": 0.7458959817886353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46995866298675537, + "epoch": 0.7, + "learning_rate": 4.650042265426881e-05, + "loss": 0.3563, + "step": 828, + "task_loss": 0.34809648990631104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3645036816596985, + "epoch": 0.7, + "learning_rate": 4.6496196111580734e-05, + "loss": 0.3703, + "step": 829, + "task_loss": 0.5023232698440552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5025608539581299, + "epoch": 0.7, + "learning_rate": 4.649196956889265e-05, + "loss": 0.3532, + "step": 830, + "task_loss": 0.3286297619342804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4966927468776703, + "epoch": 0.7, + "learning_rate": 4.6487743026204567e-05, + "loss": 0.3588, + "step": 831, + "task_loss": 1.2833762168884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2537088394165039, + "epoch": 0.7, + "learning_rate": 4.6483516483516486e-05, + "loss": 0.2375, + "step": 832, + "task_loss": 0.506493091583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4204622507095337, + "epoch": 0.7, + "learning_rate": 4.64792899408284e-05, + "loss": 0.4207, + "step": 833, + "task_loss": 1.147786259651184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4535294771194458, + "epoch": 0.7, + "learning_rate": 4.6475063398140326e-05, + "loss": 0.2769, + "step": 834, + "task_loss": 0.2901158928871155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18148492276668549, + "epoch": 0.71, + "learning_rate": 4.6470836855452246e-05, + "loss": 0.298, + "step": 835, + "task_loss": 0.49322766065597534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2675839066505432, + "epoch": 0.71, + "learning_rate": 4.6466610312764165e-05, + "loss": 0.2872, + "step": 836, + "task_loss": 0.7758223414421082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4015640914440155, + "epoch": 0.71, + "learning_rate": 4.646238377007608e-05, + "loss": 0.3793, + "step": 837, + "task_loss": 0.6204982995986938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2344849705696106, + "epoch": 0.71, + "learning_rate": 4.6458157227388e-05, + "loss": 0.3693, + "step": 838, + "task_loss": 0.8100611567497253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4160458743572235, + "epoch": 0.71, + "learning_rate": 4.645393068469992e-05, + "loss": 0.3833, + "step": 839, + "task_loss": 0.20661017298698425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3408738374710083, + "epoch": 0.71, + "learning_rate": 4.644970414201184e-05, + "loss": 0.3255, + "step": 840, + "task_loss": 0.8161397576332092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28762492537498474, + "epoch": 0.71, + "learning_rate": 4.644547759932376e-05, + "loss": 0.4359, + "step": 841, + "task_loss": 0.6441768407821655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4925372004508972, + "epoch": 0.71, + "learning_rate": 4.644125105663568e-05, + "loss": 0.4242, + "step": 842, + "task_loss": 0.5591533184051514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2857370674610138, + "epoch": 0.71, + "learning_rate": 4.643702451394759e-05, + "loss": 0.3492, + "step": 843, + "task_loss": 0.14419540762901306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1425875723361969, + "epoch": 0.71, + "learning_rate": 4.643279797125951e-05, + "loss": 0.2699, + "step": 844, + "task_loss": 0.029374532401561737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2444223165512085, + "epoch": 0.71, + "learning_rate": 4.642857142857143e-05, + "loss": 0.2677, + "step": 845, + "task_loss": 0.38509276509284973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5488537549972534, + "epoch": 0.71, + "learning_rate": 4.642434488588335e-05, + "loss": 0.4558, + "step": 846, + "task_loss": 0.3536800444126129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18831950426101685, + "epoch": 0.72, + "learning_rate": 4.642011834319527e-05, + "loss": 0.2165, + "step": 847, + "task_loss": 1.4712738990783691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20634722709655762, + "epoch": 0.72, + "learning_rate": 4.641589180050719e-05, + "loss": 0.3429, + "step": 848, + "task_loss": 0.8899732828140259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48802900314331055, + "epoch": 0.72, + "learning_rate": 4.64116652578191e-05, + "loss": 0.3858, + "step": 849, + "task_loss": 0.7615939378738403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23451358079910278, + "epoch": 0.72, + "learning_rate": 4.640743871513102e-05, + "loss": 0.3009, + "step": 850, + "task_loss": 1.2422040700912476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.361864298582077, + "epoch": 0.72, + "learning_rate": 4.640321217244295e-05, + "loss": 0.4675, + "step": 851, + "task_loss": 0.8000878095626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20752942562103271, + "epoch": 0.72, + "learning_rate": 4.639898562975487e-05, + "loss": 0.2776, + "step": 852, + "task_loss": 0.3167298138141632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.371992290019989, + "epoch": 0.72, + "learning_rate": 4.639475908706678e-05, + "loss": 0.3352, + "step": 853, + "task_loss": 0.3330743908882141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35249629616737366, + "epoch": 0.72, + "learning_rate": 4.63905325443787e-05, + "loss": 0.2617, + "step": 854, + "task_loss": 0.25995922088623047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46947115659713745, + "epoch": 0.72, + "learning_rate": 4.638630600169062e-05, + "loss": 0.4313, + "step": 855, + "task_loss": 0.8979749083518982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3745986819267273, + "epoch": 0.72, + "learning_rate": 4.638207945900254e-05, + "loss": 0.2747, + "step": 856, + "task_loss": 0.10766928642988205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32626044750213623, + "epoch": 0.72, + "learning_rate": 4.637785291631446e-05, + "loss": 0.4161, + "step": 857, + "task_loss": 0.4534900188446045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33935442566871643, + "epoch": 0.72, + "learning_rate": 4.637362637362638e-05, + "loss": 0.3865, + "step": 858, + "task_loss": 1.0689483880996704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3594878017902374, + "epoch": 0.73, + "learning_rate": 4.636939983093829e-05, + "loss": 0.2921, + "step": 859, + "task_loss": 1.1598310470581055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2799212634563446, + "epoch": 0.73, + "learning_rate": 4.636517328825021e-05, + "loss": 0.3019, + "step": 860, + "task_loss": 0.27010488510131836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41094690561294556, + "epoch": 0.73, + "learning_rate": 4.636094674556213e-05, + "loss": 0.3601, + "step": 861, + "task_loss": 0.22743143141269684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2764747440814972, + "epoch": 0.73, + "learning_rate": 4.635672020287405e-05, + "loss": 0.3251, + "step": 862, + "task_loss": 0.6687231063842773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47841474413871765, + "epoch": 0.73, + "learning_rate": 4.635249366018597e-05, + "loss": 0.2957, + "step": 863, + "task_loss": 0.2392089068889618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2663487493991852, + "epoch": 0.73, + "learning_rate": 4.634826711749789e-05, + "loss": 0.2701, + "step": 864, + "task_loss": 0.3156798481941223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3532913625240326, + "epoch": 0.73, + "learning_rate": 4.634404057480981e-05, + "loss": 0.3206, + "step": 865, + "task_loss": 0.9060441255569458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24406850337982178, + "epoch": 0.73, + "learning_rate": 4.6339814032121724e-05, + "loss": 0.3477, + "step": 866, + "task_loss": 0.3608352541923523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2158551812171936, + "epoch": 0.73, + "learning_rate": 4.633558748943364e-05, + "loss": 0.28, + "step": 867, + "task_loss": 1.2199188470840454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22323551774024963, + "epoch": 0.73, + "learning_rate": 4.633136094674557e-05, + "loss": 0.3782, + "step": 868, + "task_loss": 0.14097052812576294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30778181552886963, + "epoch": 0.73, + "learning_rate": 4.632713440405748e-05, + "loss": 0.3384, + "step": 869, + "task_loss": 0.597816526889801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2497592568397522, + "epoch": 0.73, + "learning_rate": 4.63229078613694e-05, + "loss": 0.3245, + "step": 870, + "task_loss": 0.6203656196594238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2544447183609009, + "epoch": 0.74, + "learning_rate": 4.631868131868132e-05, + "loss": 0.2809, + "step": 871, + "task_loss": 0.9502989053726196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33307886123657227, + "epoch": 0.74, + "learning_rate": 4.6314454775993235e-05, + "loss": 0.361, + "step": 872, + "task_loss": 0.46259069442749023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21646437048912048, + "epoch": 0.74, + "learning_rate": 4.631022823330516e-05, + "loss": 0.236, + "step": 873, + "task_loss": 0.5346273183822632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1888790726661682, + "epoch": 0.74, + "learning_rate": 4.630600169061708e-05, + "loss": 0.3321, + "step": 874, + "task_loss": 0.37077441811561584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24533206224441528, + "epoch": 0.74, + "learning_rate": 4.6301775147928994e-05, + "loss": 0.3313, + "step": 875, + "task_loss": 0.47546735405921936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36210837960243225, + "epoch": 0.74, + "learning_rate": 4.6297548605240914e-05, + "loss": 0.3366, + "step": 876, + "task_loss": 0.5125508308410645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27049046754837036, + "epoch": 0.74, + "learning_rate": 4.6293322062552834e-05, + "loss": 0.2747, + "step": 877, + "task_loss": 0.748110830783844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3963201642036438, + "epoch": 0.74, + "learning_rate": 4.6289095519864754e-05, + "loss": 0.3637, + "step": 878, + "task_loss": 0.707857072353363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4828207194805145, + "epoch": 0.74, + "learning_rate": 4.628486897717667e-05, + "loss": 0.3784, + "step": 879, + "task_loss": 0.6764346957206726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5259439945220947, + "epoch": 0.74, + "learning_rate": 4.628064243448859e-05, + "loss": 0.3544, + "step": 880, + "task_loss": 0.8510361909866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2657608687877655, + "epoch": 0.74, + "learning_rate": 4.627641589180051e-05, + "loss": 0.2309, + "step": 881, + "task_loss": 0.23221217095851898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21574905514717102, + "epoch": 0.75, + "learning_rate": 4.6272189349112426e-05, + "loss": 0.3528, + "step": 882, + "task_loss": 0.24643269181251526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2436504065990448, + "epoch": 0.75, + "learning_rate": 4.6267962806424346e-05, + "loss": 0.3622, + "step": 883, + "task_loss": 0.7056463360786438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32132038474082947, + "epoch": 0.75, + "learning_rate": 4.6263736263736265e-05, + "loss": 0.3267, + "step": 884, + "task_loss": 0.1795760542154312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2810857594013214, + "epoch": 0.75, + "learning_rate": 4.6259509721048185e-05, + "loss": 0.3109, + "step": 885, + "task_loss": 1.6597498655319214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4146636724472046, + "epoch": 0.75, + "learning_rate": 4.6255283178360105e-05, + "loss": 0.309, + "step": 886, + "task_loss": 0.8720802664756775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3879651129245758, + "epoch": 0.75, + "learning_rate": 4.6251056635672024e-05, + "loss": 0.328, + "step": 887, + "task_loss": 0.4393438398838043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38941875100135803, + "epoch": 0.75, + "learning_rate": 4.624683009298394e-05, + "loss": 0.3296, + "step": 888, + "task_loss": 0.7270287871360779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28111058473587036, + "epoch": 0.75, + "learning_rate": 4.624260355029586e-05, + "loss": 0.2789, + "step": 889, + "task_loss": 0.5853151082992554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14975634217262268, + "epoch": 0.75, + "learning_rate": 4.6238377007607784e-05, + "loss": 0.2629, + "step": 890, + "task_loss": 0.4016755521297455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5038657188415527, + "epoch": 0.75, + "learning_rate": 4.62341504649197e-05, + "loss": 0.3411, + "step": 891, + "task_loss": 1.3833951950073242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1345922350883484, + "epoch": 0.75, + "learning_rate": 4.6229923922231616e-05, + "loss": 0.2325, + "step": 892, + "task_loss": 0.5056750178337097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31020522117614746, + "epoch": 0.75, + "learning_rate": 4.6225697379543536e-05, + "loss": 0.4206, + "step": 893, + "task_loss": 0.39337393641471863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2663968801498413, + "epoch": 0.76, + "learning_rate": 4.6221470836855456e-05, + "loss": 0.3171, + "step": 894, + "task_loss": 1.3517099618911743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3160415589809418, + "epoch": 0.76, + "learning_rate": 4.621724429416737e-05, + "loss": 0.3034, + "step": 895, + "task_loss": 0.10319017618894577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2861885726451874, + "epoch": 0.76, + "learning_rate": 4.6213017751479295e-05, + "loss": 0.4249, + "step": 896, + "task_loss": 0.3836461305618286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36938178539276123, + "epoch": 0.76, + "learning_rate": 4.6208791208791215e-05, + "loss": 0.3556, + "step": 897, + "task_loss": 1.2703704833984375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3103448748588562, + "epoch": 0.76, + "learning_rate": 4.620456466610313e-05, + "loss": 0.314, + "step": 898, + "task_loss": 0.19264638423919678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31108522415161133, + "epoch": 0.76, + "learning_rate": 4.620033812341505e-05, + "loss": 0.3194, + "step": 899, + "task_loss": 1.2347233295440674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25132980942726135, + "epoch": 0.76, + "learning_rate": 4.619611158072697e-05, + "loss": 0.3871, + "step": 900, + "task_loss": 0.28908833861351013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1578858196735382, + "epoch": 0.76, + "learning_rate": 4.619188503803889e-05, + "loss": 0.2732, + "step": 901, + "task_loss": 0.9690726399421692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5275071859359741, + "epoch": 0.76, + "learning_rate": 4.618765849535081e-05, + "loss": 0.2932, + "step": 902, + "task_loss": 0.870735764503479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2782747149467468, + "epoch": 0.76, + "learning_rate": 4.618343195266273e-05, + "loss": 0.3723, + "step": 903, + "task_loss": 0.8498339653015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5097308158874512, + "epoch": 0.76, + "learning_rate": 4.617920540997464e-05, + "loss": 0.3943, + "step": 904, + "task_loss": 1.1436095237731934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43819373846054077, + "epoch": 0.76, + "learning_rate": 4.617497886728656e-05, + "loss": 0.319, + "step": 905, + "task_loss": 0.8787400126457214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19291357696056366, + "epoch": 0.77, + "learning_rate": 4.617075232459848e-05, + "loss": 0.3443, + "step": 906, + "task_loss": 0.9966549277305603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4310792088508606, + "epoch": 0.77, + "learning_rate": 4.61665257819104e-05, + "loss": 0.4374, + "step": 907, + "task_loss": 0.5687097907066345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3214266300201416, + "epoch": 0.77, + "learning_rate": 4.616229923922232e-05, + "loss": 0.3351, + "step": 908, + "task_loss": 1.3388911485671997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38909125328063965, + "epoch": 0.77, + "learning_rate": 4.615807269653424e-05, + "loss": 0.4006, + "step": 909, + "task_loss": 0.4524170160293579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22019779682159424, + "epoch": 0.77, + "learning_rate": 4.615384615384616e-05, + "loss": 0.3276, + "step": 910, + "task_loss": 0.6273325681686401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33304253220558167, + "epoch": 0.77, + "learning_rate": 4.614961961115807e-05, + "loss": 0.347, + "step": 911, + "task_loss": 0.2653331160545349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3374251425266266, + "epoch": 0.77, + "learning_rate": 4.614539306846999e-05, + "loss": 0.3965, + "step": 912, + "task_loss": 0.6541954874992371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26544278860092163, + "epoch": 0.77, + "learning_rate": 4.614116652578192e-05, + "loss": 0.2837, + "step": 913, + "task_loss": 1.2303000688552856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2579417824745178, + "epoch": 0.77, + "learning_rate": 4.613693998309383e-05, + "loss": 0.3092, + "step": 914, + "task_loss": 1.1536496877670288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26663273572921753, + "epoch": 0.77, + "learning_rate": 4.613271344040575e-05, + "loss": 0.2463, + "step": 915, + "task_loss": 0.7196165323257446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28072217106819153, + "epoch": 0.77, + "learning_rate": 4.612848689771767e-05, + "loss": 0.2591, + "step": 916, + "task_loss": 0.5546468496322632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23951895534992218, + "epoch": 0.77, + "learning_rate": 4.612426035502958e-05, + "loss": 0.3292, + "step": 917, + "task_loss": 1.281814694404602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.432782918214798, + "epoch": 0.78, + "learning_rate": 4.612003381234151e-05, + "loss": 0.3524, + "step": 918, + "task_loss": 0.5389975309371948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2952663004398346, + "epoch": 0.78, + "learning_rate": 4.611580726965343e-05, + "loss": 0.317, + "step": 919, + "task_loss": 0.9771247506141663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46760207414627075, + "epoch": 0.78, + "learning_rate": 4.611158072696534e-05, + "loss": 0.2948, + "step": 920, + "task_loss": 1.008791208267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21138301491737366, + "epoch": 0.78, + "learning_rate": 4.610735418427726e-05, + "loss": 0.275, + "step": 921, + "task_loss": 0.5766026377677917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3450990915298462, + "epoch": 0.78, + "learning_rate": 4.610312764158918e-05, + "loss": 0.336, + "step": 922, + "task_loss": 0.39770257472991943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24819274246692657, + "epoch": 0.78, + "learning_rate": 4.60989010989011e-05, + "loss": 0.3951, + "step": 923, + "task_loss": 1.1994588375091553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26219120621681213, + "epoch": 0.78, + "learning_rate": 4.609467455621302e-05, + "loss": 0.3992, + "step": 924, + "task_loss": 1.3631459474563599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39332079887390137, + "epoch": 0.78, + "learning_rate": 4.609044801352494e-05, + "loss": 0.4493, + "step": 925, + "task_loss": 1.2999727725982666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2386210858821869, + "epoch": 0.78, + "learning_rate": 4.608622147083686e-05, + "loss": 0.27, + "step": 926, + "task_loss": 0.5237141251564026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31889891624450684, + "epoch": 0.78, + "learning_rate": 4.608199492814877e-05, + "loss": 0.2055, + "step": 927, + "task_loss": 0.12464319914579391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29726114869117737, + "epoch": 0.78, + "learning_rate": 4.607776838546069e-05, + "loss": 0.3837, + "step": 928, + "task_loss": 0.2828030288219452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5032831430435181, + "epoch": 0.78, + "learning_rate": 4.607354184277261e-05, + "loss": 0.3447, + "step": 929, + "task_loss": 0.8295583128929138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31472247838974, + "epoch": 0.79, + "learning_rate": 4.606931530008453e-05, + "loss": 0.3694, + "step": 930, + "task_loss": 1.5383825302124023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27644604444503784, + "epoch": 0.79, + "learning_rate": 4.606508875739645e-05, + "loss": 0.3165, + "step": 931, + "task_loss": 0.7559335231781006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18283003568649292, + "epoch": 0.79, + "learning_rate": 4.606086221470837e-05, + "loss": 0.1992, + "step": 932, + "task_loss": 0.144075408577919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4631301164627075, + "epoch": 0.79, + "learning_rate": 4.6056635672020285e-05, + "loss": 0.3971, + "step": 933, + "task_loss": 1.6281708478927612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42738059163093567, + "epoch": 0.79, + "learning_rate": 4.6052409129332205e-05, + "loss": 0.3377, + "step": 934, + "task_loss": 0.6741576790809631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3566298186779022, + "epoch": 0.79, + "learning_rate": 4.604818258664413e-05, + "loss": 0.3676, + "step": 935, + "task_loss": 0.27528154850006104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.330236554145813, + "epoch": 0.79, + "learning_rate": 4.6043956043956044e-05, + "loss": 0.3648, + "step": 936, + "task_loss": 0.20326869189739227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1568087637424469, + "epoch": 0.79, + "learning_rate": 4.6039729501267964e-05, + "loss": 0.3553, + "step": 937, + "task_loss": 0.3123857080936432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5085207223892212, + "epoch": 0.79, + "learning_rate": 4.6035502958579884e-05, + "loss": 0.3909, + "step": 938, + "task_loss": 0.6514917016029358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.154745414853096, + "epoch": 0.79, + "learning_rate": 4.6031276415891803e-05, + "loss": 0.3043, + "step": 939, + "task_loss": 0.2217102348804474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26345983147621155, + "epoch": 0.79, + "learning_rate": 4.602704987320372e-05, + "loss": 0.3337, + "step": 940, + "task_loss": 1.1459896564483643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32549530267715454, + "epoch": 0.79, + "learning_rate": 4.602282333051564e-05, + "loss": 0.2628, + "step": 941, + "task_loss": 0.7183759212493896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24363887310028076, + "epoch": 0.8, + "learning_rate": 4.601859678782756e-05, + "loss": 0.322, + "step": 942, + "task_loss": 0.33217161893844604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4149688482284546, + "epoch": 0.8, + "learning_rate": 4.6014370245139476e-05, + "loss": 0.307, + "step": 943, + "task_loss": 0.26546531915664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48364436626434326, + "epoch": 0.8, + "learning_rate": 4.6010143702451395e-05, + "loss": 0.3325, + "step": 944, + "task_loss": 0.4470929503440857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22630174458026886, + "epoch": 0.8, + "learning_rate": 4.6005917159763315e-05, + "loss": 0.3402, + "step": 945, + "task_loss": 0.6232134699821472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17915762960910797, + "epoch": 0.8, + "learning_rate": 4.6001690617075235e-05, + "loss": 0.2294, + "step": 946, + "task_loss": 0.3601345717906952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28538429737091064, + "epoch": 0.8, + "learning_rate": 4.5997464074387155e-05, + "loss": 0.2676, + "step": 947, + "task_loss": 0.43201231956481934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27372103929519653, + "epoch": 0.8, + "learning_rate": 4.5993237531699074e-05, + "loss": 0.2827, + "step": 948, + "task_loss": 1.5435055494308472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2887795567512512, + "epoch": 0.8, + "learning_rate": 4.598901098901099e-05, + "loss": 0.3111, + "step": 949, + "task_loss": 0.3925054967403412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2809564471244812, + "epoch": 0.8, + "learning_rate": 4.598478444632291e-05, + "loss": 0.2503, + "step": 950, + "task_loss": 0.7698982357978821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25734391808509827, + "epoch": 0.8, + "learning_rate": 4.598055790363483e-05, + "loss": 0.2912, + "step": 951, + "task_loss": 0.41852739453315735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.395840048789978, + "epoch": 0.8, + "learning_rate": 4.597633136094675e-05, + "loss": 0.3705, + "step": 952, + "task_loss": 0.505364716053009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16447016596794128, + "epoch": 0.81, + "learning_rate": 4.5972104818258666e-05, + "loss": 0.3264, + "step": 953, + "task_loss": 0.39294782280921936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3487350046634674, + "epoch": 0.81, + "learning_rate": 4.5967878275570586e-05, + "loss": 0.32, + "step": 954, + "task_loss": 1.0585432052612305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2846418023109436, + "epoch": 0.81, + "learning_rate": 4.5963651732882506e-05, + "loss": 0.3766, + "step": 955, + "task_loss": 0.32126832008361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.123996302485466, + "epoch": 0.81, + "learning_rate": 4.595942519019442e-05, + "loss": 0.2981, + "step": 956, + "task_loss": 0.019682593643665314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2558656632900238, + "epoch": 0.81, + "learning_rate": 4.5955198647506345e-05, + "loss": 0.2799, + "step": 957, + "task_loss": 0.8895480036735535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48175427317619324, + "epoch": 0.81, + "learning_rate": 4.5950972104818265e-05, + "loss": 0.4012, + "step": 958, + "task_loss": 0.6715694069862366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2468600869178772, + "epoch": 0.81, + "learning_rate": 4.594674556213018e-05, + "loss": 0.3789, + "step": 959, + "task_loss": 0.40784138441085815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28495824337005615, + "epoch": 0.81, + "learning_rate": 4.59425190194421e-05, + "loss": 0.3826, + "step": 960, + "task_loss": 0.3370976746082306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2992165684700012, + "epoch": 0.81, + "learning_rate": 4.593829247675402e-05, + "loss": 0.315, + "step": 961, + "task_loss": 0.10349813103675842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39707690477371216, + "epoch": 0.81, + "learning_rate": 4.593406593406594e-05, + "loss": 0.3413, + "step": 962, + "task_loss": 0.5664233565330505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2814026474952698, + "epoch": 0.81, + "learning_rate": 4.592983939137786e-05, + "loss": 0.3904, + "step": 963, + "task_loss": 1.0049792528152466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19465628266334534, + "epoch": 0.81, + "learning_rate": 4.5925612848689777e-05, + "loss": 0.2879, + "step": 964, + "task_loss": 0.07359443604946136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3513874411582947, + "epoch": 0.82, + "learning_rate": 4.592138630600169e-05, + "loss": 0.3549, + "step": 965, + "task_loss": 0.5460289716720581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33203014731407166, + "epoch": 0.82, + "learning_rate": 4.591715976331361e-05, + "loss": 0.3409, + "step": 966, + "task_loss": 1.481011986732483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27470993995666504, + "epoch": 0.82, + "learning_rate": 4.591293322062553e-05, + "loss": 0.3305, + "step": 967, + "task_loss": 0.4593425393104553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11648278683423996, + "epoch": 0.82, + "learning_rate": 4.590870667793745e-05, + "loss": 0.321, + "step": 968, + "task_loss": 0.3774500787258148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.353023886680603, + "epoch": 0.82, + "learning_rate": 4.590448013524937e-05, + "loss": 0.3351, + "step": 969, + "task_loss": 1.1915583610534668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2356659471988678, + "epoch": 0.82, + "learning_rate": 4.590025359256129e-05, + "loss": 0.2714, + "step": 970, + "task_loss": 0.4875516891479492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37664133310317993, + "epoch": 0.82, + "learning_rate": 4.589602704987321e-05, + "loss": 0.3707, + "step": 971, + "task_loss": 0.6027059555053711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2550051212310791, + "epoch": 0.82, + "learning_rate": 4.589180050718512e-05, + "loss": 0.269, + "step": 972, + "task_loss": 0.33342424035072327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4495178461074829, + "epoch": 0.82, + "learning_rate": 4.588757396449704e-05, + "loss": 0.3211, + "step": 973, + "task_loss": 0.6046321392059326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21325893700122833, + "epoch": 0.82, + "learning_rate": 4.588334742180897e-05, + "loss": 0.3904, + "step": 974, + "task_loss": 0.1933622509241104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2678370177745819, + "epoch": 0.82, + "learning_rate": 4.587912087912088e-05, + "loss": 0.2613, + "step": 975, + "task_loss": 0.39825576543807983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18920046091079712, + "epoch": 0.82, + "learning_rate": 4.58748943364328e-05, + "loss": 0.4215, + "step": 976, + "task_loss": 0.5402249097824097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23081913590431213, + "epoch": 0.83, + "learning_rate": 4.587066779374472e-05, + "loss": 0.3163, + "step": 977, + "task_loss": 1.4790289402008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21092328429222107, + "epoch": 0.83, + "learning_rate": 4.586644125105663e-05, + "loss": 0.2447, + "step": 978, + "task_loss": 0.6820929050445557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30764931440353394, + "epoch": 0.83, + "learning_rate": 4.586221470836856e-05, + "loss": 0.3149, + "step": 979, + "task_loss": 0.20430991053581238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2513306140899658, + "epoch": 0.83, + "learning_rate": 4.585798816568048e-05, + "loss": 0.3218, + "step": 980, + "task_loss": 1.1656993627548218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2478112280368805, + "epoch": 0.83, + "learning_rate": 4.58537616229924e-05, + "loss": 0.3601, + "step": 981, + "task_loss": 1.4366612434387207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4809786081314087, + "epoch": 0.83, + "learning_rate": 4.584953508030431e-05, + "loss": 0.3599, + "step": 982, + "task_loss": 0.32178795337677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3879166841506958, + "epoch": 0.83, + "learning_rate": 4.584530853761623e-05, + "loss": 0.3039, + "step": 983, + "task_loss": 0.35262659192085266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20825481414794922, + "epoch": 0.83, + "learning_rate": 4.584108199492815e-05, + "loss": 0.3586, + "step": 984, + "task_loss": 0.8200913071632385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5232114791870117, + "epoch": 0.83, + "learning_rate": 4.583685545224007e-05, + "loss": 0.3648, + "step": 985, + "task_loss": 1.6637427806854248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35127195715904236, + "epoch": 0.83, + "learning_rate": 4.583262890955199e-05, + "loss": 0.3242, + "step": 986, + "task_loss": 1.1167726516723633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3223801255226135, + "epoch": 0.83, + "learning_rate": 4.582840236686391e-05, + "loss": 0.3698, + "step": 987, + "task_loss": 1.1016654968261719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26718980073928833, + "epoch": 0.83, + "learning_rate": 4.582417582417582e-05, + "loss": 0.306, + "step": 988, + "task_loss": 0.582869827747345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.285981684923172, + "epoch": 0.84, + "learning_rate": 4.581994928148774e-05, + "loss": 0.3058, + "step": 989, + "task_loss": 0.9653950333595276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3817080855369568, + "epoch": 0.84, + "learning_rate": 4.581572273879966e-05, + "loss": 0.2993, + "step": 990, + "task_loss": 0.8561366200447083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22112765908241272, + "epoch": 0.84, + "learning_rate": 4.581149619611158e-05, + "loss": 0.2875, + "step": 991, + "task_loss": 0.6691679954528809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2103605717420578, + "epoch": 0.84, + "learning_rate": 4.58072696534235e-05, + "loss": 0.2907, + "step": 992, + "task_loss": 1.021261215209961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39024853706359863, + "epoch": 0.84, + "learning_rate": 4.580304311073542e-05, + "loss": 0.3313, + "step": 993, + "task_loss": 0.17300234735012054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4057610034942627, + "epoch": 0.84, + "learning_rate": 4.5798816568047335e-05, + "loss": 0.334, + "step": 994, + "task_loss": 1.045444369316101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37642043828964233, + "epoch": 0.84, + "learning_rate": 4.5794590025359255e-05, + "loss": 0.3436, + "step": 995, + "task_loss": 0.3469464182853699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3785593509674072, + "epoch": 0.84, + "learning_rate": 4.579036348267118e-05, + "loss": 0.3412, + "step": 996, + "task_loss": 0.40450188517570496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3710609972476959, + "epoch": 0.84, + "learning_rate": 4.57861369399831e-05, + "loss": 0.3786, + "step": 997, + "task_loss": 1.002790927886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21029534935951233, + "epoch": 0.84, + "learning_rate": 4.5781910397295014e-05, + "loss": 0.3131, + "step": 998, + "task_loss": 1.4204806089401245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2556370496749878, + "epoch": 0.84, + "learning_rate": 4.5777683854606934e-05, + "loss": 0.4118, + "step": 999, + "task_loss": 1.1292375326156616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3558931350708008, + "epoch": 0.84, + "learning_rate": 4.577345731191885e-05, + "loss": 0.2613, + "step": 1000, + "task_loss": 0.49822065234184265 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.913940594059406, + "eval_loss": 0.18967993557453156, + "eval_runtime": 338.0531, + "eval_samples_per_second": 74.692, + "eval_steps_per_second": 0.586, + "step": 1000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23912113904953003, + "epoch": 0.85, + "learning_rate": 4.576923076923077e-05, + "loss": 0.3496, + "step": 1001, + "task_loss": 0.3939993381500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2952667474746704, + "epoch": 0.85, + "learning_rate": 4.576500422654269e-05, + "loss": 0.3001, + "step": 1002, + "task_loss": 0.44996291399002075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14472924172878265, + "epoch": 0.85, + "learning_rate": 4.576077768385461e-05, + "loss": 0.3498, + "step": 1003, + "task_loss": 0.209279865026474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31769293546676636, + "epoch": 0.85, + "learning_rate": 4.5756551141166525e-05, + "loss": 0.3995, + "step": 1004, + "task_loss": 0.5328032374382019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8548835515975952, + "epoch": 0.85, + "learning_rate": 4.5752324598478445e-05, + "loss": 0.4194, + "step": 1005, + "task_loss": 1.5901955366134644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5097059011459351, + "epoch": 0.85, + "learning_rate": 4.5748098055790365e-05, + "loss": 0.4045, + "step": 1006, + "task_loss": 0.09073382616043091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17884889245033264, + "epoch": 0.85, + "learning_rate": 4.5743871513102285e-05, + "loss": 0.32, + "step": 1007, + "task_loss": 0.22524794936180115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2864243686199188, + "epoch": 0.85, + "learning_rate": 4.5739644970414204e-05, + "loss": 0.35, + "step": 1008, + "task_loss": 0.5916472673416138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39622369408607483, + "epoch": 0.85, + "learning_rate": 4.5735418427726124e-05, + "loss": 0.3036, + "step": 1009, + "task_loss": 1.0695246458053589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3557763397693634, + "epoch": 0.85, + "learning_rate": 4.573119188503804e-05, + "loss": 0.4268, + "step": 1010, + "task_loss": 1.484282374382019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43481993675231934, + "epoch": 0.85, + "learning_rate": 4.572696534234996e-05, + "loss": 0.3944, + "step": 1011, + "task_loss": 0.8948748111724854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14724287390708923, + "epoch": 0.85, + "learning_rate": 4.572273879966188e-05, + "loss": 0.2871, + "step": 1012, + "task_loss": 0.2009107768535614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14038264751434326, + "epoch": 0.86, + "learning_rate": 4.57185122569738e-05, + "loss": 0.2934, + "step": 1013, + "task_loss": 0.16116684675216675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26486438512802124, + "epoch": 0.86, + "learning_rate": 4.5714285714285716e-05, + "loss": 0.3377, + "step": 1014, + "task_loss": 0.20203635096549988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14152994751930237, + "epoch": 0.86, + "learning_rate": 4.5710059171597636e-05, + "loss": 0.2721, + "step": 1015, + "task_loss": 0.9633371233940125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26985055208206177, + "epoch": 0.86, + "learning_rate": 4.5705832628909556e-05, + "loss": 0.3403, + "step": 1016, + "task_loss": 0.47459304332733154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15847118198871613, + "epoch": 0.86, + "learning_rate": 4.570160608622147e-05, + "loss": 0.3049, + "step": 1017, + "task_loss": 0.11324185878038406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36625000834465027, + "epoch": 0.86, + "learning_rate": 4.5697379543533395e-05, + "loss": 0.3174, + "step": 1018, + "task_loss": 0.8263206481933594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42509859800338745, + "epoch": 0.86, + "learning_rate": 4.5693153000845315e-05, + "loss": 0.4844, + "step": 1019, + "task_loss": 0.5466962456703186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25256964564323425, + "epoch": 0.86, + "learning_rate": 4.568892645815723e-05, + "loss": 0.2754, + "step": 1020, + "task_loss": 0.9385986924171448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13900268077850342, + "epoch": 0.86, + "learning_rate": 4.568469991546915e-05, + "loss": 0.4031, + "step": 1021, + "task_loss": 0.08166217803955078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14120763540267944, + "epoch": 0.86, + "learning_rate": 4.568047337278107e-05, + "loss": 0.2618, + "step": 1022, + "task_loss": 0.01267182920128107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26838311553001404, + "epoch": 0.86, + "learning_rate": 4.567624683009298e-05, + "loss": 0.2616, + "step": 1023, + "task_loss": 1.0019125938415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1619986593723297, + "epoch": 0.87, + "learning_rate": 4.567202028740491e-05, + "loss": 0.2586, + "step": 1024, + "task_loss": 0.11912364512681961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26556265354156494, + "epoch": 0.87, + "learning_rate": 4.5667793744716826e-05, + "loss": 0.3354, + "step": 1025, + "task_loss": 0.5522617697715759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45204195380210876, + "epoch": 0.87, + "learning_rate": 4.5663567202028746e-05, + "loss": 0.3513, + "step": 1026, + "task_loss": 0.2980015277862549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40704768896102905, + "epoch": 0.87, + "learning_rate": 4.565934065934066e-05, + "loss": 0.3226, + "step": 1027, + "task_loss": 0.3302595317363739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28598201274871826, + "epoch": 0.87, + "learning_rate": 4.565511411665258e-05, + "loss": 0.3815, + "step": 1028, + "task_loss": 0.76263827085495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20379169285297394, + "epoch": 0.87, + "learning_rate": 4.56508875739645e-05, + "loss": 0.3025, + "step": 1029, + "task_loss": 0.5890194773674011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4659903645515442, + "epoch": 0.87, + "learning_rate": 4.564666103127642e-05, + "loss": 0.4342, + "step": 1030, + "task_loss": 0.9342086911201477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33894455432891846, + "epoch": 0.87, + "learning_rate": 4.564243448858834e-05, + "loss": 0.3395, + "step": 1031, + "task_loss": 0.31865039467811584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34021997451782227, + "epoch": 0.87, + "learning_rate": 4.563820794590026e-05, + "loss": 0.3123, + "step": 1032, + "task_loss": 0.14327658712863922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21941176056861877, + "epoch": 0.87, + "learning_rate": 4.563398140321217e-05, + "loss": 0.4326, + "step": 1033, + "task_loss": 0.0961829125881195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3016534447669983, + "epoch": 0.87, + "learning_rate": 4.562975486052409e-05, + "loss": 0.3609, + "step": 1034, + "task_loss": 0.5202248096466064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19246357679367065, + "epoch": 0.87, + "learning_rate": 4.562552831783602e-05, + "loss": 0.2473, + "step": 1035, + "task_loss": 0.512968122959137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31773266196250916, + "epoch": 0.88, + "learning_rate": 4.562130177514793e-05, + "loss": 0.3702, + "step": 1036, + "task_loss": 0.3712160587310791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3228875994682312, + "epoch": 0.88, + "learning_rate": 4.561707523245985e-05, + "loss": 0.502, + "step": 1037, + "task_loss": 1.380858063697815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18843914568424225, + "epoch": 0.88, + "learning_rate": 4.561284868977177e-05, + "loss": 0.2984, + "step": 1038, + "task_loss": 0.1643063724040985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23467159271240234, + "epoch": 0.88, + "learning_rate": 4.560862214708368e-05, + "loss": 0.3617, + "step": 1039, + "task_loss": 0.5294719338417053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2105211764574051, + "epoch": 0.88, + "learning_rate": 4.56043956043956e-05, + "loss": 0.4081, + "step": 1040, + "task_loss": 0.878465473651886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28716641664505005, + "epoch": 0.88, + "learning_rate": 4.560016906170753e-05, + "loss": 0.3827, + "step": 1041, + "task_loss": 0.3960720896720886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3224141299724579, + "epoch": 0.88, + "learning_rate": 4.559594251901945e-05, + "loss": 0.2828, + "step": 1042, + "task_loss": 0.8254625797271729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2405867874622345, + "epoch": 0.88, + "learning_rate": 4.559171597633136e-05, + "loss": 0.2642, + "step": 1043, + "task_loss": 0.47411441802978516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23462903499603271, + "epoch": 0.88, + "learning_rate": 4.558748943364328e-05, + "loss": 0.2545, + "step": 1044, + "task_loss": 0.5556021928787231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20667308568954468, + "epoch": 0.88, + "learning_rate": 4.55832628909552e-05, + "loss": 0.3163, + "step": 1045, + "task_loss": 0.7395960092544556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4931938052177429, + "epoch": 0.88, + "learning_rate": 4.557903634826712e-05, + "loss": 0.3809, + "step": 1046, + "task_loss": 0.9698067903518677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40067774057388306, + "epoch": 0.88, + "learning_rate": 4.557480980557904e-05, + "loss": 0.4271, + "step": 1047, + "task_loss": 0.1059439554810524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15574510395526886, + "epoch": 0.89, + "learning_rate": 4.557058326289096e-05, + "loss": 0.416, + "step": 1048, + "task_loss": 0.5026818513870239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26975488662719727, + "epoch": 0.89, + "learning_rate": 4.556635672020287e-05, + "loss": 0.4143, + "step": 1049, + "task_loss": 0.6446378231048584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3566000759601593, + "epoch": 0.89, + "learning_rate": 4.556213017751479e-05, + "loss": 0.3302, + "step": 1050, + "task_loss": 0.7665790915489197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21607786417007446, + "epoch": 0.89, + "learning_rate": 4.555790363482671e-05, + "loss": 0.3492, + "step": 1051, + "task_loss": 0.5244699120521545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22081968188285828, + "epoch": 0.89, + "learning_rate": 4.555367709213863e-05, + "loss": 0.292, + "step": 1052, + "task_loss": 0.11215706169605255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46807587146759033, + "epoch": 0.89, + "learning_rate": 4.554945054945055e-05, + "loss": 0.3871, + "step": 1053, + "task_loss": 1.020930290222168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42110878229141235, + "epoch": 0.89, + "learning_rate": 4.554522400676247e-05, + "loss": 0.3234, + "step": 1054, + "task_loss": 0.9853991866111755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15152102708816528, + "epoch": 0.89, + "learning_rate": 4.554099746407439e-05, + "loss": 0.2772, + "step": 1055, + "task_loss": 0.10596860200166702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34711506962776184, + "epoch": 0.89, + "learning_rate": 4.5536770921386304e-05, + "loss": 0.3232, + "step": 1056, + "task_loss": 0.764802873134613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8313543200492859, + "epoch": 0.89, + "learning_rate": 4.5532544378698224e-05, + "loss": 0.4054, + "step": 1057, + "task_loss": 0.738728940486908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2969970703125, + "epoch": 0.89, + "learning_rate": 4.552831783601015e-05, + "loss": 0.3304, + "step": 1058, + "task_loss": 1.0873650312423706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1679776906967163, + "epoch": 0.89, + "learning_rate": 4.5524091293322064e-05, + "loss": 0.335, + "step": 1059, + "task_loss": 0.09169580042362213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25704994797706604, + "epoch": 0.9, + "learning_rate": 4.5519864750633983e-05, + "loss": 0.4206, + "step": 1060, + "task_loss": 1.3041671514511108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3977009057998657, + "epoch": 0.9, + "learning_rate": 4.55156382079459e-05, + "loss": 0.3533, + "step": 1061, + "task_loss": 0.6762240529060364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2623427212238312, + "epoch": 0.9, + "learning_rate": 4.5511411665257816e-05, + "loss": 0.3511, + "step": 1062, + "task_loss": 0.6453579068183899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40080171823501587, + "epoch": 0.9, + "learning_rate": 4.550718512256974e-05, + "loss": 0.3013, + "step": 1063, + "task_loss": 0.42494773864746094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4466301202774048, + "epoch": 0.9, + "learning_rate": 4.550295857988166e-05, + "loss": 0.3897, + "step": 1064, + "task_loss": 1.061437726020813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3910818099975586, + "epoch": 0.9, + "learning_rate": 4.5498732037193575e-05, + "loss": 0.3597, + "step": 1065, + "task_loss": 0.7140308618545532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6939448714256287, + "epoch": 0.9, + "learning_rate": 4.5494505494505495e-05, + "loss": 0.5485, + "step": 1066, + "task_loss": 0.2731471061706543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35204967856407166, + "epoch": 0.9, + "learning_rate": 4.5490278951817415e-05, + "loss": 0.3037, + "step": 1067, + "task_loss": 0.3471311628818512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41570937633514404, + "epoch": 0.9, + "learning_rate": 4.5486052409129335e-05, + "loss": 0.3338, + "step": 1068, + "task_loss": 0.9508390426635742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4666168689727783, + "epoch": 0.9, + "learning_rate": 4.5481825866441254e-05, + "loss": 0.3404, + "step": 1069, + "task_loss": 0.6101741790771484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21288862824440002, + "epoch": 0.9, + "learning_rate": 4.5477599323753174e-05, + "loss": 0.3286, + "step": 1070, + "task_loss": 0.5916988849639893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24179798364639282, + "epoch": 0.9, + "learning_rate": 4.5473372781065094e-05, + "loss": 0.2858, + "step": 1071, + "task_loss": 0.35410770773887634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.534841775894165, + "epoch": 0.91, + "learning_rate": 4.546914623837701e-05, + "loss": 0.399, + "step": 1072, + "task_loss": 1.1390701532363892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2865257263183594, + "epoch": 0.91, + "learning_rate": 4.5464919695688926e-05, + "loss": 0.2767, + "step": 1073, + "task_loss": 0.3366599977016449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6614929437637329, + "epoch": 0.91, + "learning_rate": 4.5460693153000846e-05, + "loss": 0.3562, + "step": 1074, + "task_loss": 0.44420671463012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27515363693237305, + "epoch": 0.91, + "learning_rate": 4.5456466610312766e-05, + "loss": 0.2986, + "step": 1075, + "task_loss": 0.4002901315689087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3757680654525757, + "epoch": 0.91, + "learning_rate": 4.5452240067624686e-05, + "loss": 0.4197, + "step": 1076, + "task_loss": 0.3707464337348938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19199183583259583, + "epoch": 0.91, + "learning_rate": 4.5448013524936605e-05, + "loss": 0.304, + "step": 1077, + "task_loss": 0.9447035193443298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15268227458000183, + "epoch": 0.91, + "learning_rate": 4.544378698224852e-05, + "loss": 0.2282, + "step": 1078, + "task_loss": 0.5656244158744812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2276354283094406, + "epoch": 0.91, + "learning_rate": 4.543956043956044e-05, + "loss": 0.2688, + "step": 1079, + "task_loss": 0.3784196376800537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2888668179512024, + "epoch": 0.91, + "learning_rate": 4.5435333896872365e-05, + "loss": 0.3501, + "step": 1080, + "task_loss": 0.6107114553451538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5053978562355042, + "epoch": 0.91, + "learning_rate": 4.543110735418428e-05, + "loss": 0.4985, + "step": 1081, + "task_loss": 1.2131242752075195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3194124698638916, + "epoch": 0.91, + "learning_rate": 4.54268808114962e-05, + "loss": 0.306, + "step": 1082, + "task_loss": 0.8277527093887329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18914735317230225, + "epoch": 0.91, + "learning_rate": 4.542265426880812e-05, + "loss": 0.2814, + "step": 1083, + "task_loss": 0.40239810943603516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1989613026380539, + "epoch": 0.92, + "learning_rate": 4.541842772612004e-05, + "loss": 0.3718, + "step": 1084, + "task_loss": 0.23209945857524872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2026151567697525, + "epoch": 0.92, + "learning_rate": 4.5414201183431957e-05, + "loss": 0.286, + "step": 1085, + "task_loss": 0.6754297614097595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2859596312046051, + "epoch": 0.92, + "learning_rate": 4.5409974640743876e-05, + "loss": 0.3311, + "step": 1086, + "task_loss": 0.773571252822876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34478265047073364, + "epoch": 0.92, + "learning_rate": 4.5405748098055796e-05, + "loss": 0.3192, + "step": 1087, + "task_loss": 0.3566049337387085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23188981413841248, + "epoch": 0.92, + "learning_rate": 4.540152155536771e-05, + "loss": 0.3517, + "step": 1088, + "task_loss": 0.2745937705039978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.350466787815094, + "epoch": 0.92, + "learning_rate": 4.539729501267963e-05, + "loss": 0.3144, + "step": 1089, + "task_loss": 0.44054114818573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4395449757575989, + "epoch": 0.92, + "learning_rate": 4.539306846999155e-05, + "loss": 0.3265, + "step": 1090, + "task_loss": 0.77244633436203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17624793946743011, + "epoch": 0.92, + "learning_rate": 4.538884192730347e-05, + "loss": 0.4105, + "step": 1091, + "task_loss": 1.2938034534454346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2849665880203247, + "epoch": 0.92, + "learning_rate": 4.538461538461539e-05, + "loss": 0.3222, + "step": 1092, + "task_loss": 0.6030697822570801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2724902927875519, + "epoch": 0.92, + "learning_rate": 4.538038884192731e-05, + "loss": 0.3462, + "step": 1093, + "task_loss": 0.5950093269348145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6185368895530701, + "epoch": 0.92, + "learning_rate": 4.537616229923922e-05, + "loss": 0.4845, + "step": 1094, + "task_loss": 0.5496288537979126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3195955455303192, + "epoch": 0.93, + "learning_rate": 4.537193575655114e-05, + "loss": 0.309, + "step": 1095, + "task_loss": 0.423796147108078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.197946697473526, + "epoch": 0.93, + "learning_rate": 4.536770921386306e-05, + "loss": 0.2794, + "step": 1096, + "task_loss": 0.1070009246468544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5025796890258789, + "epoch": 0.93, + "learning_rate": 4.536348267117498e-05, + "loss": 0.4463, + "step": 1097, + "task_loss": 0.750395655632019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4005841314792633, + "epoch": 0.93, + "learning_rate": 4.53592561284869e-05, + "loss": 0.4582, + "step": 1098, + "task_loss": 0.8168262243270874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41911229491233826, + "epoch": 0.93, + "learning_rate": 4.535502958579882e-05, + "loss": 0.3319, + "step": 1099, + "task_loss": 0.9958615303039551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4013597071170807, + "epoch": 0.93, + "learning_rate": 4.535080304311074e-05, + "loss": 0.3226, + "step": 1100, + "task_loss": 0.6000455021858215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15728117525577545, + "epoch": 0.93, + "learning_rate": 4.534657650042265e-05, + "loss": 0.3644, + "step": 1101, + "task_loss": 0.3147355318069458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2503385543823242, + "epoch": 0.93, + "learning_rate": 4.534234995773458e-05, + "loss": 0.2453, + "step": 1102, + "task_loss": 0.3066632151603699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17960643768310547, + "epoch": 0.93, + "learning_rate": 4.53381234150465e-05, + "loss": 0.2847, + "step": 1103, + "task_loss": 0.02352246083319187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4108585715293884, + "epoch": 0.93, + "learning_rate": 4.533389687235841e-05, + "loss": 0.2944, + "step": 1104, + "task_loss": 0.7071190476417542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2555324137210846, + "epoch": 0.93, + "learning_rate": 4.532967032967033e-05, + "loss": 0.3536, + "step": 1105, + "task_loss": 0.9082435369491577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.412306547164917, + "epoch": 0.93, + "learning_rate": 4.532544378698225e-05, + "loss": 0.3878, + "step": 1106, + "task_loss": 0.6728000044822693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3466845750808716, + "epoch": 0.94, + "learning_rate": 4.532121724429417e-05, + "loss": 0.3573, + "step": 1107, + "task_loss": 0.42230331897735596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4577171206474304, + "epoch": 0.94, + "learning_rate": 4.531699070160609e-05, + "loss": 0.2847, + "step": 1108, + "task_loss": 0.49863535165786743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22043421864509583, + "epoch": 0.94, + "learning_rate": 4.531276415891801e-05, + "loss": 0.3771, + "step": 1109, + "task_loss": 0.2066926658153534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24291947484016418, + "epoch": 0.94, + "learning_rate": 4.530853761622992e-05, + "loss": 0.3067, + "step": 1110, + "task_loss": 0.34979933500289917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3833697736263275, + "epoch": 0.94, + "learning_rate": 4.530431107354184e-05, + "loss": 0.3468, + "step": 1111, + "task_loss": 0.5778905749320984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27140769362449646, + "epoch": 0.94, + "learning_rate": 4.530008453085376e-05, + "loss": 0.3597, + "step": 1112, + "task_loss": 0.7911396622657776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2681792378425598, + "epoch": 0.94, + "learning_rate": 4.529585798816568e-05, + "loss": 0.4337, + "step": 1113, + "task_loss": 0.4078145921230316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35279330611228943, + "epoch": 0.94, + "learning_rate": 4.52916314454776e-05, + "loss": 0.3911, + "step": 1114, + "task_loss": 0.5029667615890503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23784451186656952, + "epoch": 0.94, + "learning_rate": 4.528740490278952e-05, + "loss": 0.311, + "step": 1115, + "task_loss": 0.06948636472225189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25149303674697876, + "epoch": 0.94, + "learning_rate": 4.528317836010144e-05, + "loss": 0.2771, + "step": 1116, + "task_loss": 0.418526291847229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2822594940662384, + "epoch": 0.94, + "learning_rate": 4.5278951817413354e-05, + "loss": 0.3637, + "step": 1117, + "task_loss": 0.5836882591247559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33597350120544434, + "epoch": 0.94, + "learning_rate": 4.5274725274725274e-05, + "loss": 0.2707, + "step": 1118, + "task_loss": 0.2719374895095825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1671154946088791, + "epoch": 0.95, + "learning_rate": 4.52704987320372e-05, + "loss": 0.2925, + "step": 1119, + "task_loss": 0.4273891746997833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18667495250701904, + "epoch": 0.95, + "learning_rate": 4.5266272189349114e-05, + "loss": 0.3111, + "step": 1120, + "task_loss": 0.15779000520706177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3783668279647827, + "epoch": 0.95, + "learning_rate": 4.526204564666103e-05, + "loss": 0.3265, + "step": 1121, + "task_loss": 1.1066042184829712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22718198597431183, + "epoch": 0.95, + "learning_rate": 4.525781910397295e-05, + "loss": 0.2758, + "step": 1122, + "task_loss": 0.3243577182292938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2897654175758362, + "epoch": 0.95, + "learning_rate": 4.5253592561284866e-05, + "loss": 0.3091, + "step": 1123, + "task_loss": 0.3035680651664734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1746860146522522, + "epoch": 0.95, + "learning_rate": 4.524936601859679e-05, + "loss": 0.2814, + "step": 1124, + "task_loss": 0.8144105076789856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29736846685409546, + "epoch": 0.95, + "learning_rate": 4.524513947590871e-05, + "loss": 0.4041, + "step": 1125, + "task_loss": 0.43958353996276855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.345153272151947, + "epoch": 0.95, + "learning_rate": 4.5240912933220625e-05, + "loss": 0.3786, + "step": 1126, + "task_loss": 0.6712506413459778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28389960527420044, + "epoch": 0.95, + "learning_rate": 4.5236686390532545e-05, + "loss": 0.336, + "step": 1127, + "task_loss": 1.4295475482940674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2105841338634491, + "epoch": 0.95, + "learning_rate": 4.5232459847844465e-05, + "loss": 0.3519, + "step": 1128, + "task_loss": 0.8832420110702515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3502531945705414, + "epoch": 0.95, + "learning_rate": 4.5228233305156384e-05, + "loss": 0.3231, + "step": 1129, + "task_loss": 1.0557340383529663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35527557134628296, + "epoch": 0.95, + "learning_rate": 4.5224006762468304e-05, + "loss": 0.3625, + "step": 1130, + "task_loss": 0.874673068523407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14178301393985748, + "epoch": 0.96, + "learning_rate": 4.5219780219780224e-05, + "loss": 0.3108, + "step": 1131, + "task_loss": 0.10780816525220871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32874321937561035, + "epoch": 0.96, + "learning_rate": 4.5215553677092144e-05, + "loss": 0.3829, + "step": 1132, + "task_loss": 0.1389910876750946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20046493411064148, + "epoch": 0.96, + "learning_rate": 4.5211327134404057e-05, + "loss": 0.371, + "step": 1133, + "task_loss": 0.6660134792327881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3007514476776123, + "epoch": 0.96, + "learning_rate": 4.5207100591715976e-05, + "loss": 0.3175, + "step": 1134, + "task_loss": 0.29382359981536865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4086291790008545, + "epoch": 0.96, + "learning_rate": 4.5202874049027896e-05, + "loss": 0.4597, + "step": 1135, + "task_loss": 1.5114408731460571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26803046464920044, + "epoch": 0.96, + "learning_rate": 4.5198647506339816e-05, + "loss": 0.3112, + "step": 1136, + "task_loss": 0.731511116027832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3154323697090149, + "epoch": 0.96, + "learning_rate": 4.5194420963651736e-05, + "loss": 0.3292, + "step": 1137, + "task_loss": 0.8449059724807739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38997378945350647, + "epoch": 0.96, + "learning_rate": 4.5190194420963655e-05, + "loss": 0.3196, + "step": 1138, + "task_loss": 0.3029818832874298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23375770449638367, + "epoch": 0.96, + "learning_rate": 4.518596787827557e-05, + "loss": 0.3731, + "step": 1139, + "task_loss": 0.8964962959289551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.544218897819519, + "epoch": 0.96, + "learning_rate": 4.518174133558749e-05, + "loss": 0.3918, + "step": 1140, + "task_loss": 0.3147408664226532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5715116858482361, + "epoch": 0.96, + "learning_rate": 4.5177514792899414e-05, + "loss": 0.4012, + "step": 1141, + "task_loss": 0.6181625723838806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5026450753211975, + "epoch": 0.96, + "learning_rate": 4.5173288250211334e-05, + "loss": 0.4475, + "step": 1142, + "task_loss": 0.8529657125473022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22553801536560059, + "epoch": 0.97, + "learning_rate": 4.516906170752325e-05, + "loss": 0.3412, + "step": 1143, + "task_loss": 0.3726401627063751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14779837429523468, + "epoch": 0.97, + "learning_rate": 4.516483516483517e-05, + "loss": 0.2942, + "step": 1144, + "task_loss": 0.22872155904769897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24052079021930695, + "epoch": 0.97, + "learning_rate": 4.516060862214709e-05, + "loss": 0.2718, + "step": 1145, + "task_loss": 0.34861892461776733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23975765705108643, + "epoch": 0.97, + "learning_rate": 4.5156382079459006e-05, + "loss": 0.362, + "step": 1146, + "task_loss": 1.418960690498352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24894945323467255, + "epoch": 0.97, + "learning_rate": 4.5152155536770926e-05, + "loss": 0.273, + "step": 1147, + "task_loss": 0.3433225452899933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2631831765174866, + "epoch": 0.97, + "learning_rate": 4.5147928994082846e-05, + "loss": 0.3382, + "step": 1148, + "task_loss": 0.5902136564254761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39580830931663513, + "epoch": 0.97, + "learning_rate": 4.514370245139476e-05, + "loss": 0.3485, + "step": 1149, + "task_loss": 1.0852302312850952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35671716928482056, + "epoch": 0.97, + "learning_rate": 4.513947590870668e-05, + "loss": 0.3003, + "step": 1150, + "task_loss": 0.9312202334403992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37598925828933716, + "epoch": 0.97, + "learning_rate": 4.51352493660186e-05, + "loss": 0.2755, + "step": 1151, + "task_loss": 0.870559573173523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35334280133247375, + "epoch": 0.97, + "learning_rate": 4.513102282333052e-05, + "loss": 0.3073, + "step": 1152, + "task_loss": 0.2768174111843109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2127104252576828, + "epoch": 0.97, + "learning_rate": 4.512679628064244e-05, + "loss": 0.3253, + "step": 1153, + "task_loss": 0.20750877261161804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12464627623558044, + "epoch": 0.97, + "learning_rate": 4.512256973795436e-05, + "loss": 0.3133, + "step": 1154, + "task_loss": 0.5762190222740173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3354787230491638, + "epoch": 0.98, + "learning_rate": 4.511834319526627e-05, + "loss": 0.4139, + "step": 1155, + "task_loss": 0.9044637680053711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22063955664634705, + "epoch": 0.98, + "learning_rate": 4.511411665257819e-05, + "loss": 0.3418, + "step": 1156, + "task_loss": 0.23841185867786407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15205956995487213, + "epoch": 0.98, + "learning_rate": 4.510989010989011e-05, + "loss": 0.2575, + "step": 1157, + "task_loss": 0.39559870958328247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4291304349899292, + "epoch": 0.98, + "learning_rate": 4.5105663567202036e-05, + "loss": 0.2708, + "step": 1158, + "task_loss": 0.37119153141975403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19470790028572083, + "epoch": 0.98, + "learning_rate": 4.510143702451395e-05, + "loss": 0.3946, + "step": 1159, + "task_loss": 0.43265652656555176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3006949722766876, + "epoch": 0.98, + "learning_rate": 4.509721048182587e-05, + "loss": 0.3647, + "step": 1160, + "task_loss": 0.9573840498924255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23114867508411407, + "epoch": 0.98, + "learning_rate": 4.509298393913779e-05, + "loss": 0.2867, + "step": 1161, + "task_loss": 0.268611341714859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24773582816123962, + "epoch": 0.98, + "learning_rate": 4.50887573964497e-05, + "loss": 0.3458, + "step": 1162, + "task_loss": 0.718194842338562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5117859244346619, + "epoch": 0.98, + "learning_rate": 4.508453085376163e-05, + "loss": 0.4148, + "step": 1163, + "task_loss": 1.3371446132659912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3454076051712036, + "epoch": 0.98, + "learning_rate": 4.508030431107355e-05, + "loss": 0.2876, + "step": 1164, + "task_loss": 0.7379355430603027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.362413227558136, + "epoch": 0.98, + "learning_rate": 4.507607776838546e-05, + "loss": 0.2879, + "step": 1165, + "task_loss": 0.3431224822998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29363489151000977, + "epoch": 0.99, + "learning_rate": 4.507185122569738e-05, + "loss": 0.2822, + "step": 1166, + "task_loss": 0.4833914041519165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2251429408788681, + "epoch": 0.99, + "learning_rate": 4.50676246830093e-05, + "loss": 0.2479, + "step": 1167, + "task_loss": 0.2708614766597748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2700488567352295, + "epoch": 0.99, + "learning_rate": 4.5063398140321214e-05, + "loss": 0.3066, + "step": 1168, + "task_loss": 0.9844657182693481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6104203462600708, + "epoch": 0.99, + "learning_rate": 4.505917159763314e-05, + "loss": 0.376, + "step": 1169, + "task_loss": 0.8192672729492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21359509229660034, + "epoch": 0.99, + "learning_rate": 4.505494505494506e-05, + "loss": 0.2018, + "step": 1170, + "task_loss": 0.15554118156433105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16944999992847443, + "epoch": 0.99, + "learning_rate": 4.505071851225698e-05, + "loss": 0.2056, + "step": 1171, + "task_loss": 0.14514167606830597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3679329752922058, + "epoch": 0.99, + "learning_rate": 4.504649196956889e-05, + "loss": 0.2941, + "step": 1172, + "task_loss": 0.944209098815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3078688979148865, + "epoch": 0.99, + "learning_rate": 4.504226542688081e-05, + "loss": 0.311, + "step": 1173, + "task_loss": 0.24003413319587708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21030209958553314, + "epoch": 0.99, + "learning_rate": 4.503803888419273e-05, + "loss": 0.3264, + "step": 1174, + "task_loss": 0.37533411383628845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2800752520561218, + "epoch": 0.99, + "learning_rate": 4.503381234150465e-05, + "loss": 0.3553, + "step": 1175, + "task_loss": 1.0056970119476318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6683984398841858, + "epoch": 0.99, + "learning_rate": 4.502958579881657e-05, + "loss": 0.4299, + "step": 1176, + "task_loss": 0.6065475940704346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15709534287452698, + "epoch": 0.99, + "learning_rate": 4.502535925612849e-05, + "loss": 0.2601, + "step": 1177, + "task_loss": 0.42998674511909485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19763796031475067, + "epoch": 1.0, + "learning_rate": 4.5021132713440404e-05, + "loss": 0.3497, + "step": 1178, + "task_loss": 0.46407046914100647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36419183015823364, + "epoch": 1.0, + "learning_rate": 4.5016906170752324e-05, + "loss": 0.3657, + "step": 1179, + "task_loss": 1.5342886447906494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22747506201267242, + "epoch": 1.0, + "learning_rate": 4.501267962806425e-05, + "loss": 0.2932, + "step": 1180, + "task_loss": 0.34386053681373596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23093140125274658, + "epoch": 1.0, + "learning_rate": 4.500845308537616e-05, + "loss": 0.2518, + "step": 1181, + "task_loss": 1.2779991626739502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19193525612354279, + "epoch": 1.0, + "learning_rate": 4.500422654268808e-05, + "loss": 0.3346, + "step": 1182, + "task_loss": 0.13631373643875122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20304660499095917, + "epoch": 1.0, + "learning_rate": 4.5e-05, + "loss": 0.3005, + "step": 1183, + "task_loss": 0.3247835338115692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -0.0047100442461669445, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010019401097710254, + "compression/movement_sparsity/model_sparsity": 0.0009675203846720848, + "compression_loss": 0.0, + "distillation_loss": 0.3937605023384094, + "epoch": 1.0, + "learning_rate": 4.4995773457311916e-05, + "loss": 0.5788, + "step": 1184, + "task_loss": 0.9173633456230164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0037974665492690463, + "compression/movement_sparsity/importance_threshold": -0.004698120089186681, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010110978705153682, + "compression/movement_sparsity/model_sparsity": 0.0009763635481622922, + "compression_loss": 0.4102906286716461, + "distillation_loss": 0.3808823823928833, + "epoch": 1.0, + "learning_rate": 4.4991546914623836e-05, + "loss": 0.7763, + "step": 1185, + "task_loss": 0.8962676525115967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007588518460622751, + "compression/movement_sparsity/importance_threshold": -0.004686216074358686, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010141504574301492, + "compression/movement_sparsity/model_sparsity": 0.0009793112693256946, + "compression_loss": 0.8198877573013306, + "distillation_loss": 0.3202807903289795, + "epoch": 1.0, + "learning_rate": 4.498732037193576e-05, + "loss": 1.1626, + "step": 1186, + "task_loss": 0.41993218660354614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011373161156410516, + "compression/movement_sparsity/importance_threshold": -0.004674332184656624, + "compression/movement_sparsity/linear_layer_sparsity": 0.001023308218174492, + "compression/movement_sparsity/model_sparsity": 0.000988154432815902, + "compression_loss": 1.2287936210632324, + "distillation_loss": 0.44650501012802124, + "epoch": 1.0, + "learning_rate": 4.498309382924768e-05, + "loss": 1.5534, + "step": 1187, + "task_loss": 0.715635359287262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015151400058979747, + "compression/movement_sparsity/importance_threshold": -0.004662468403054164, + "compression/movement_sparsity/linear_layer_sparsity": 0.001037056783458642, + "compression/movement_sparsity/model_sparsity": 0.0010014306925870075, + "compression_loss": 1.63700532913208, + "distillation_loss": 0.6858876943588257, + "epoch": 1.0, + "learning_rate": 4.4978867286559595e-05, + "loss": 2.1211, + "step": 1188, + "task_loss": 0.6914085149765015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018923240590679014, + "compression/movement_sparsity/importance_threshold": -0.004650624712524971, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010706352395212322, + "compression/movement_sparsity/model_sparsity": 0.0010338556253844343, + "compression_loss": 2.044524908065796, + "distillation_loss": 0.3964264690876007, + "epoch": 1.01, + "learning_rate": 4.4974640743871514e-05, + "loss": 2.3648, + "step": 1189, + "task_loss": 0.3107689917087555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022688688173855553, + "compression/movement_sparsity/importance_threshold": -0.004638801096042715, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011042136955838225, + "compression/movement_sparsity/model_sparsity": 0.0010662805581818612, + "compression_loss": 2.4513514041900635, + "distillation_loss": 0.481489896774292, + "epoch": 1.01, + "learning_rate": 4.4970414201183434e-05, + "loss": 2.8931, + "step": 1190, + "task_loss": 0.24588356912136078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026447748230858936, + "compression/movement_sparsity/importance_threshold": -0.004626997536581058, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011133714563281653, + "compression/movement_sparsity/model_sparsity": 0.0010751237216720686, + "compression_loss": 2.857487678527832, + "distillation_loss": 0.2723100781440735, + "epoch": 1.01, + "learning_rate": 4.4966187658495354e-05, + "loss": 3.1045, + "step": 1191, + "task_loss": 0.393263578414917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030200426184036733, + "compression/movement_sparsity/importance_threshold": -0.00461521401711367, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011486908408655915, + "compression/movement_sparsity/model_sparsity": 0.0011092297766954985, + "compression_loss": 3.262927770614624, + "distillation_loss": 0.26160818338394165, + "epoch": 1.01, + "learning_rate": 4.4961961115807274e-05, + "loss": 3.6934, + "step": 1192, + "task_loss": 0.5637128949165344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033946727455736514, + "compression/movement_sparsity/importance_threshold": -0.004603450520614219, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011914389818401603, + "compression/movement_sparsity/model_sparsity": 0.0011505093875189272, + "compression_loss": 3.6676766872406006, + "distillation_loss": 0.34008169174194336, + "epoch": 1.01, + "learning_rate": 4.4957734573119193e-05, + "loss": 4.0452, + "step": 1193, + "task_loss": 0.8559937477111816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03768665746830768, + "compression/movement_sparsity/importance_threshold": -0.004591707030056367, + "compression/movement_sparsity/linear_layer_sparsity": 0.001225005513735115, + "compression/movement_sparsity/model_sparsity": 0.0011829228057805596, + "compression_loss": 4.071734428405762, + "distillation_loss": 0.30722981691360474, + "epoch": 1.01, + "learning_rate": 4.4953508030431106e-05, + "loss": 4.4792, + "step": 1194, + "task_loss": 1.1641637086868286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04142022164409781, + "compression/movement_sparsity/importance_threshold": -0.0045799835284137825, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012463855463062173, + "compression/movement_sparsity/model_sparsity": 0.0012035683684601711, + "compression_loss": 4.475098133087158, + "distillation_loss": 0.17917898297309875, + "epoch": 1.01, + "learning_rate": 4.4949281487743026e-05, + "loss": 4.8255, + "step": 1195, + "task_loss": 0.061687059700489044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04514742540545447, + "compression/movement_sparsity/importance_threshold": -0.004568279998660136, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012891217631131502, + "compression/movement_sparsity/model_sparsity": 0.0012448364647478054, + "compression_loss": 4.877766132354736, + "distillation_loss": 0.24638590216636658, + "epoch": 1.01, + "learning_rate": 4.4945054945054946e-05, + "loss": 5.2702, + "step": 1196, + "task_loss": 0.36251261830329895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04886827417472722, + "compression/movement_sparsity/importance_threshold": -0.00455659642376909, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012830285134512243, + "compression/movement_sparsity/model_sparsity": 0.001238952536956795, + "compression_loss": 5.279744625091553, + "distillation_loss": 0.44648393988609314, + "epoch": 1.01, + "learning_rate": 4.4940828402366866e-05, + "loss": 5.6236, + "step": 1197, + "task_loss": 0.6393554210662842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05258277337426348, + "compression/movement_sparsity/importance_threshold": -0.004544932786714311, + "compression/movement_sparsity/linear_layer_sparsity": 0.0013074492087694718, + "compression/movement_sparsity/model_sparsity": 0.0012625343062640146, + "compression_loss": 5.681037425994873, + "distillation_loss": 0.2602553963661194, + "epoch": 1.01, + "learning_rate": 4.4936601859678785e-05, + "loss": 5.9757, + "step": 1198, + "task_loss": 0.5741052031517029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05629092842641148, + "compression/movement_sparsity/importance_threshold": -0.004533289070469468, + "compression/movement_sparsity/linear_layer_sparsity": 0.001329008103855112, + "compression/movement_sparsity/model_sparsity": 0.0012833525869805442, + "compression_loss": 6.081630706787109, + "distillation_loss": 0.1991117298603058, + "epoch": 1.01, + "learning_rate": 4.4932375316990705e-05, + "loss": 6.3822, + "step": 1199, + "task_loss": 1.0478651523590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059992744753518956, + "compression/movement_sparsity/importance_threshold": -0.004521665258008229, + "compression/movement_sparsity/linear_layer_sparsity": 0.0013919915573077405, + "compression/movement_sparsity/model_sparsity": 0.0013441723650473088, + "compression_loss": 6.481531620025635, + "distillation_loss": 0.3642633855342865, + "epoch": 1.01, + "learning_rate": 4.4928148774302625e-05, + "loss": 6.8904, + "step": 1200, + "task_loss": 0.6244286298751831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06368822777793498, + "compression/movement_sparsity/importance_threshold": -0.004510061332304256, + "compression/movement_sparsity/linear_layer_sparsity": 0.001468902438559057, + "compression/movement_sparsity/model_sparsity": 0.0014184411209220969, + "compression_loss": 6.880734443664551, + "distillation_loss": 0.23397861421108246, + "epoch": 1.02, + "learning_rate": 4.492392223161454e-05, + "loss": 7.206, + "step": 1201, + "task_loss": 0.36994272470474243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06737738292200746, + "compression/movement_sparsity/importance_threshold": -0.004498477276331219, + "compression/movement_sparsity/linear_layer_sparsity": 0.0015038402497321356, + "compression/movement_sparsity/model_sparsity": 0.0014521787108001014, + "compression_loss": 7.2792439460754395, + "distillation_loss": 0.30487754940986633, + "epoch": 1.02, + "learning_rate": 4.491969568892646e-05, + "loss": 7.6558, + "step": 1202, + "task_loss": 0.596557080745697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07106021560808395, + "compression/movement_sparsity/importance_threshold": -0.004486913073062785, + "compression/movement_sparsity/linear_layer_sparsity": 0.0015490686175749638, + "compression/movement_sparsity/model_sparsity": 0.0014958533450687946, + "compression_loss": 7.67705774307251, + "distillation_loss": 0.20317770540714264, + "epoch": 1.02, + "learning_rate": 4.4915469146238384e-05, + "loss": 8.0139, + "step": 1203, + "task_loss": 0.9098516702651978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07473673125851371, + "compression/movement_sparsity/importance_threshold": -0.004475368705472619, + "compression/movement_sparsity/linear_layer_sparsity": 0.0015843760779447544, + "compression/movement_sparsity/model_sparsity": 0.00152994788555643, + "compression_loss": 8.07417106628418, + "distillation_loss": 0.2673817574977875, + "epoch": 1.02, + "learning_rate": 4.49112426035503e-05, + "loss": 8.4658, + "step": 1204, + "task_loss": 0.8123477101325989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07840693529564446, + "compression/movement_sparsity/importance_threshold": -0.0044638441565343885, + "compression/movement_sparsity/linear_layer_sparsity": 0.0016935179843158084, + "compression/movement_sparsity/model_sparsity": 0.001635340431683862, + "compression_loss": 8.470596313476562, + "distillation_loss": 0.3441730737686157, + "epoch": 1.02, + "learning_rate": 4.490701606086222e-05, + "loss": 8.7867, + "step": 1205, + "task_loss": 0.21049267053604126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08207083314182378, + "compression/movement_sparsity/importance_threshold": -0.004452339409221761, + "compression/movement_sparsity/linear_layer_sparsity": 0.0017871942452631483, + "compression/movement_sparsity/model_sparsity": 0.0017257986248857744, + "compression_loss": 8.866316795349121, + "distillation_loss": 0.3460751175880432, + "epoch": 1.02, + "learning_rate": 4.4902789518174136e-05, + "loss": 9.2504, + "step": 1206, + "task_loss": 0.3923206031322479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08572843021940091, + "compression/movement_sparsity/importance_threshold": -0.0044408544465084025, + "compression/movement_sparsity/linear_layer_sparsity": 0.001922652789606552, + "compression/movement_sparsity/model_sparsity": 0.0018566037515117578, + "compression_loss": 9.26134967803955, + "distillation_loss": 0.2772218585014343, + "epoch": 1.02, + "learning_rate": 4.489856297548605e-05, + "loss": 9.5127, + "step": 1207, + "task_loss": 0.49269193410873413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08937973195072357, + "compression/movement_sparsity/importance_threshold": -0.004429389251367978, + "compression/movement_sparsity/linear_layer_sparsity": 0.00202417515285829, + "compression/movement_sparsity/model_sparsity": 0.0019546385092664782, + "compression_loss": 9.655686378479004, + "distillation_loss": 0.305117666721344, + "epoch": 1.02, + "learning_rate": 4.4894336432797976e-05, + "loss": 9.986, + "step": 1208, + "task_loss": 1.6477200984954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09302474375814002, + "compression/movement_sparsity/importance_threshold": -0.004417943806774157, + "compression/movement_sparsity/linear_layer_sparsity": 0.002192055509003605, + "compression/movement_sparsity/model_sparsity": 0.002116751658717818, + "compression_loss": 10.049321174621582, + "distillation_loss": 0.2753101587295532, + "epoch": 1.02, + "learning_rate": 4.4890109890109896e-05, + "loss": 10.3471, + "step": 1209, + "task_loss": 0.9186722636222839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09666347106399781, + "compression/movement_sparsity/importance_threshold": -0.004406518095700606, + "compression/movement_sparsity/linear_layer_sparsity": 0.0023940866812580323, + "compression/movement_sparsity/model_sparsity": 0.002311842438684723, + "compression_loss": 10.442254066467285, + "distillation_loss": 0.4144275188446045, + "epoch": 1.02, + "learning_rate": 4.488588334742181e-05, + "loss": 10.8568, + "step": 1210, + "task_loss": 0.8719772696495056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10029591929064618, + "compression/movement_sparsity/importance_threshold": -0.00439511210112099, + "compression/movement_sparsity/linear_layer_sparsity": 0.0025583659387773167, + "compression/movement_sparsity/model_sparsity": 0.002470478198326111, + "compression_loss": 10.834500312805176, + "distillation_loss": 0.4396495819091797, + "epoch": 1.02, + "learning_rate": 4.488165680473373e-05, + "loss": 11.262, + "step": 1211, + "task_loss": 0.9327820539474487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10392209386043288, + "compression/movement_sparsity/importance_threshold": -0.0043837258060089755, + "compression/movement_sparsity/linear_layer_sparsity": 0.002723241404678395, + "compression/movement_sparsity/model_sparsity": 0.0026296896847572268, + "compression_loss": 11.226054191589355, + "distillation_loss": 0.4338788688182831, + "epoch": 1.02, + "learning_rate": 4.487743026204565e-05, + "loss": 11.5975, + "step": 1212, + "task_loss": 0.75318443775177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10754200019570564, + "compression/movement_sparsity/importance_threshold": -0.004372359193338233, + "compression/movement_sparsity/linear_layer_sparsity": 0.002939414639748955, + "compression/movement_sparsity/model_sparsity": 0.002838436704176457, + "compression_loss": 11.616912841796875, + "distillation_loss": 0.20085662603378296, + "epoch": 1.03, + "learning_rate": 4.487320371935757e-05, + "loss": 11.8302, + "step": 1213, + "task_loss": 0.13112294673919678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11115564371881353, + "compression/movement_sparsity/importance_threshold": -0.0043610122460824245, + "compression/movement_sparsity/linear_layer_sparsity": 0.0032813043742044197, + "compression/movement_sparsity/model_sparsity": 0.0031685814744775304, + "compression_loss": 12.007067680358887, + "distillation_loss": 0.2116384506225586, + "epoch": 1.03, + "learning_rate": 4.486897717666949e-05, + "loss": 12.3508, + "step": 1214, + "task_loss": 0.383533775806427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11476302985210429, + "compression/movement_sparsity/importance_threshold": -0.004349684947215218, + "compression/movement_sparsity/linear_layer_sparsity": 0.003593240599558596, + "compression/movement_sparsity/model_sparsity": 0.003469801730862718, + "compression_loss": 12.396533012390137, + "distillation_loss": 0.2002447545528412, + "epoch": 1.03, + "learning_rate": 4.486475063398141e-05, + "loss": 12.7334, + "step": 1215, + "task_loss": 0.9297578930854797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11836416401792565, + "compression/movement_sparsity/importance_threshold": -0.004338377279710284, + "compression/movement_sparsity/linear_layer_sparsity": 0.0038534736340436705, + "compression/movement_sparsity/model_sparsity": 0.003721094960042776, + "compression_loss": 12.785306930541992, + "distillation_loss": 0.30430400371551514, + "epoch": 1.03, + "learning_rate": 4.486052409129333e-05, + "loss": 13.2453, + "step": 1216, + "task_loss": 0.3592604398727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12195905163862686, + "compression/movement_sparsity/importance_threshold": -0.004327089226541283, + "compression/movement_sparsity/linear_layer_sparsity": 0.0041488114180487255, + "compression/movement_sparsity/model_sparsity": 0.004006286982601963, + "compression_loss": 13.173396110534668, + "distillation_loss": 0.24041709303855896, + "epoch": 1.03, + "learning_rate": 4.485629754860524e-05, + "loss": 13.4677, + "step": 1217, + "task_loss": 0.45772600173950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1255476981365553, + "compression/movement_sparsity/importance_threshold": -0.004315820770681887, + "compression/movement_sparsity/linear_layer_sparsity": 0.004499882761583804, + "compression/movement_sparsity/model_sparsity": 0.004345297945464833, + "compression_loss": 13.560797691345215, + "distillation_loss": 0.21942958235740662, + "epoch": 1.03, + "learning_rate": 4.485207100591716e-05, + "loss": 13.8709, + "step": 1218, + "task_loss": 0.7078566551208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12913010893405957, + "compression/movement_sparsity/importance_threshold": -0.004304571895105759, + "compression/movement_sparsity/linear_layer_sparsity": 0.004916179302087054, + "compression/movement_sparsity/model_sparsity": 0.004747293419123841, + "compression_loss": 13.94749927520752, + "distillation_loss": 0.3022770881652832, + "epoch": 1.03, + "learning_rate": 4.484784446322908e-05, + "loss": 14.2599, + "step": 1219, + "task_loss": 0.40298476815223694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1327062894534874, + "compression/movement_sparsity/importance_threshold": -0.004293342582786569, + "compression/movement_sparsity/linear_layer_sparsity": 0.005411473453177895, + "compression/movement_sparsity/model_sparsity": 0.005225572692421682, + "compression_loss": 14.333516120910645, + "distillation_loss": 0.1827034056186676, + "epoch": 1.03, + "learning_rate": 4.4843617920541e-05, + "loss": 14.6496, + "step": 1220, + "task_loss": 0.6569167971611023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1362762451171875, + "compression/movement_sparsity/importance_threshold": -0.004282132816697981, + "compression/movement_sparsity/linear_layer_sparsity": 0.005854289342503305, + "compression/movement_sparsity/model_sparsity": 0.005653176493687749, + "compression_loss": 14.718843460083008, + "distillation_loss": 0.31411004066467285, + "epoch": 1.03, + "learning_rate": 4.483939137785292e-05, + "loss": 15.0359, + "step": 1221, + "task_loss": 1.0844365358352661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13983998134750797, + "compression/movement_sparsity/importance_threshold": -0.004270942579813663, + "compression/movement_sparsity/linear_layer_sparsity": 0.006360076761113706, + "compression/movement_sparsity/model_sparsity": 0.006141588558484785, + "compression_loss": 15.103476524353027, + "distillation_loss": 0.2729410231113434, + "epoch": 1.03, + "learning_rate": 4.483516483516484e-05, + "loss": 15.4742, + "step": 1222, + "task_loss": 0.5998313426971436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14339750356679637, + "compression/movement_sparsity/importance_threshold": -0.004259771855107283, + "compression/movement_sparsity/linear_layer_sparsity": 0.006962223378389516, + "compression/movement_sparsity/model_sparsity": 0.006723049587037506, + "compression_loss": 15.48742961883545, + "distillation_loss": 0.374523401260376, + "epoch": 1.03, + "learning_rate": 4.483093829247675e-05, + "loss": 15.8628, + "step": 1223, + "task_loss": 0.2658517062664032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14694881719740194, + "compression/movement_sparsity/importance_threshold": -0.004248620625552504, + "compression/movement_sparsity/linear_layer_sparsity": 0.007452366289061665, + "compression/movement_sparsity/model_sparsity": 0.007196354580872106, + "compression_loss": 15.870694160461426, + "distillation_loss": 0.45831477642059326, + "epoch": 1.03, + "learning_rate": 4.482671174978867e-05, + "loss": 16.2698, + "step": 1224, + "task_loss": 1.0425890684127808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15049392766167224, + "compression/movement_sparsity/importance_threshold": -0.0042374888741229955, + "compression/movement_sparsity/linear_layer_sparsity": 0.008002809715468373, + "compression/movement_sparsity/model_sparsity": 0.007727888582219698, + "compression_loss": 16.253276824951172, + "distillation_loss": 0.2757789194583893, + "epoch": 1.04, + "learning_rate": 4.48224852071006e-05, + "loss": 16.5561, + "step": 1225, + "task_loss": 0.605686366558075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1540328403819557, + "compression/movement_sparsity/importance_threshold": -0.004226376583792424, + "compression/movement_sparsity/linear_layer_sparsity": 0.008642732095814598, + "compression/movement_sparsity/model_sparsity": 0.008345827660169525, + "compression_loss": 16.635168075561523, + "distillation_loss": 0.3095744848251343, + "epoch": 1.04, + "learning_rate": 4.481825866441251e-05, + "loss": 16.9931, + "step": 1226, + "task_loss": 0.15681417286396027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15756556078059986, + "compression/movement_sparsity/importance_threshold": -0.0042152837375344565, + "compression/movement_sparsity/linear_layer_sparsity": 0.009174275716518463, + "compression/movement_sparsity/model_sparsity": 0.00885911112228277, + "compression_loss": 17.016376495361328, + "distillation_loss": 0.23931270837783813, + "epoch": 1.04, + "learning_rate": 4.481403212172443e-05, + "loss": 17.3445, + "step": 1227, + "task_loss": 0.47263094782829285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1610920942799538, + "compression/movement_sparsity/importance_threshold": -0.004204210318322759, + "compression/movement_sparsity/linear_layer_sparsity": 0.009869681248874359, + "compression/movement_sparsity/model_sparsity": 0.009530627335284595, + "compression_loss": 17.39688491821289, + "distillation_loss": 0.48166126012802124, + "epoch": 1.04, + "learning_rate": 4.480980557903635e-05, + "loss": 17.8271, + "step": 1228, + "task_loss": 1.272774338722229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16461244630236543, + "compression/movement_sparsity/importance_threshold": -0.004193156309130997, + "compression/movement_sparsity/linear_layer_sparsity": 0.010668123513771745, + "compression/movement_sparsity/model_sparsity": 0.010301640652087047, + "compression_loss": 17.776710510253906, + "distillation_loss": 0.2593509554862976, + "epoch": 1.04, + "learning_rate": 4.480557903634827e-05, + "loss": 18.0733, + "step": 1229, + "task_loss": 0.23353984951972961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1681266222701825, + "compression/movement_sparsity/importance_threshold": -0.00418212169293284, + "compression/movement_sparsity/linear_layer_sparsity": 0.011291614391115751, + "compression/movement_sparsity/model_sparsity": 0.010903712699711994, + "compression_loss": 18.1558837890625, + "distillation_loss": 0.46332481503486633, + "epoch": 1.04, + "learning_rate": 4.480135249366019e-05, + "loss": 18.6271, + "step": 1230, + "task_loss": 0.2720649242401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17163462760575404, + "compression/movement_sparsity/importance_threshold": -0.004171106452701952, + "compression/movement_sparsity/linear_layer_sparsity": 0.011990084434554063, + "compression/movement_sparsity/model_sparsity": 0.011578188148413016, + "compression_loss": 18.534366607666016, + "distillation_loss": 0.5338602066040039, + "epoch": 1.04, + "learning_rate": 4.479712595097211e-05, + "loss": 19.0769, + "step": 1231, + "task_loss": 0.370920330286026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17513646773142766, + "compression/movement_sparsity/importance_threshold": -0.004160110571412002, + "compression/movement_sparsity/linear_layer_sparsity": 0.012780895232164498, + "compression/movement_sparsity/model_sparsity": 0.012341832162306962, + "compression_loss": 18.912193298339844, + "distillation_loss": 0.8660389184951782, + "epoch": 1.04, + "learning_rate": 4.479289940828403e-05, + "loss": 19.4349, + "step": 1232, + "task_loss": 1.3601912260055542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17863214806955124, + "compression/movement_sparsity/importance_threshold": -0.004149134032036657, + "compression/movement_sparsity/linear_layer_sparsity": 0.013379965414190256, + "compression/movement_sparsity/model_sparsity": 0.012920322440624691, + "compression_loss": 19.289379119873047, + "distillation_loss": 0.4159047603607178, + "epoch": 1.04, + "learning_rate": 4.478867286559594e-05, + "loss": 19.9182, + "step": 1233, + "task_loss": 1.1996432542800903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18212167404247404, + "compression/movement_sparsity/importance_threshold": -0.0041381768175495805, + "compression/movement_sparsity/linear_layer_sparsity": 0.013985701205924461, + "compression/movement_sparsity/model_sparsity": 0.01350524934445157, + "compression_loss": 19.665891647338867, + "distillation_loss": 0.910930871963501, + "epoch": 1.04, + "learning_rate": 4.478444632290786e-05, + "loss": 20.2966, + "step": 1234, + "task_loss": 0.3422808051109314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18560505107254344, + "compression/movement_sparsity/importance_threshold": -0.004127238910924441, + "compression/movement_sparsity/linear_layer_sparsity": 0.01467253326175017, + "compression/movement_sparsity/model_sparsity": 0.01416848660621712, + "compression_loss": 20.04172706604004, + "distillation_loss": 0.5755290985107422, + "epoch": 1.04, + "learning_rate": 4.478021978021978e-05, + "loss": 20.5837, + "step": 1235, + "task_loss": 0.8419200778007507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18908228458210785, + "compression/movement_sparsity/importance_threshold": -0.004116320295134906, + "compression/movement_sparsity/linear_layer_sparsity": 0.015490256905714747, + "compression/movement_sparsity/model_sparsity": 0.014958118927399344, + "compression_loss": 20.416900634765625, + "distillation_loss": 0.6663327813148499, + "epoch": 1.04, + "learning_rate": 4.47759932375317e-05, + "loss": 20.9608, + "step": 1236, + "task_loss": 0.8463159203529358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19255337999351502, + "compression/movement_sparsity/importance_threshold": -0.004105420953154642, + "compression/movement_sparsity/linear_layer_sparsity": 0.016359695664716064, + "compression/movement_sparsity/model_sparsity": 0.01579768979032249, + "compression_loss": 20.791406631469727, + "distillation_loss": 0.5115723609924316, + "epoch": 1.05, + "learning_rate": 4.477176669484362e-05, + "loss": 21.2891, + "step": 1237, + "task_loss": 0.6861749887466431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.196018342729114, + "compression/movement_sparsity/importance_threshold": -0.004094540867957315, + "compression/movement_sparsity/linear_layer_sparsity": 0.017066751108852196, + "compression/movement_sparsity/model_sparsity": 0.01648045570479558, + "compression_loss": 21.165239334106445, + "distillation_loss": 0.4414406418800354, + "epoch": 1.05, + "learning_rate": 4.476754015215554e-05, + "loss": 21.701, + "step": 1238, + "task_loss": 0.6631407141685486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19947717821125255, + "compression/movement_sparsity/importance_threshold": -0.004083680022516592, + "compression/movement_sparsity/linear_layer_sparsity": 0.017810628382647874, + "compression/movement_sparsity/model_sparsity": 0.017198778505802214, + "compression_loss": 21.53841209411621, + "distillation_loss": 0.17983829975128174, + "epoch": 1.05, + "learning_rate": 4.4763313609467454e-05, + "loss": 22.1237, + "step": 1239, + "task_loss": 0.0907113328576088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20292989186227856, + "compression/movement_sparsity/importance_threshold": -0.004072838399806141, + "compression/movement_sparsity/linear_layer_sparsity": 0.018584137213018673, + "compression/movement_sparsity/model_sparsity": 0.01794571492825828, + "compression_loss": 21.910913467407227, + "distillation_loss": 0.4408838152885437, + "epoch": 1.05, + "learning_rate": 4.4759087066779374e-05, + "loss": 22.364, + "step": 1240, + "task_loss": 0.6799322366714478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20637648910454076, + "compression/movement_sparsity/importance_threshold": -0.004062015982799626, + "compression/movement_sparsity/linear_layer_sparsity": 0.01916205392165841, + "compression/movement_sparsity/model_sparsity": 0.018503778420076496, + "compression_loss": 22.282758712768555, + "distillation_loss": 0.3418844938278198, + "epoch": 1.05, + "learning_rate": 4.4754860524091293e-05, + "loss": 22.7301, + "step": 1241, + "task_loss": 0.20166003704071045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20981697536038724, + "compression/movement_sparsity/importance_threshold": -0.004051212754470715, + "compression/movement_sparsity/linear_layer_sparsity": 0.019898681301531482, + "compression/movement_sparsity/model_sparsity": 0.019215100383320048, + "compression_loss": 22.653905868530273, + "distillation_loss": 0.47465944290161133, + "epoch": 1.05, + "learning_rate": 4.475063398140321e-05, + "loss": 23.1505, + "step": 1242, + "task_loss": 0.9753168225288391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21325135605216555, + "compression/movement_sparsity/importance_threshold": -0.004040428697793077, + "compression/movement_sparsity/linear_layer_sparsity": 0.020610351398542694, + "compression/movement_sparsity/model_sparsity": 0.019902322423145626, + "compression_loss": 23.02440643310547, + "distillation_loss": 0.8669644594192505, + "epoch": 1.05, + "learning_rate": 4.474640743871513e-05, + "loss": 23.6245, + "step": 1243, + "task_loss": 1.547431468963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21667963660222495, + "compression/movement_sparsity/importance_threshold": -0.004029663795740375, + "compression/movement_sparsity/linear_layer_sparsity": 0.0214038451338712, + "compression/movement_sparsity/model_sparsity": 0.020668557207593342, + "compression_loss": 23.394229888916016, + "distillation_loss": 0.4561890661716461, + "epoch": 1.05, + "learning_rate": 4.474218089602705e-05, + "loss": 23.8506, + "step": 1244, + "task_loss": 0.7799176573753357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22010182243291299, + "compression/movement_sparsity/importance_threshold": -0.004018918031286277, + "compression/movement_sparsity/linear_layer_sparsity": 0.022140114788715196, + "compression/movement_sparsity/model_sparsity": 0.02137953373476306, + "compression_loss": 23.763362884521484, + "distillation_loss": 0.5162093639373779, + "epoch": 1.05, + "learning_rate": 4.473795435333897e-05, + "loss": 24.4094, + "step": 1245, + "task_loss": 0.8172413110733032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2235179189665779, + "compression/movement_sparsity/importance_threshold": -0.004008191387404452, + "compression/movement_sparsity/linear_layer_sparsity": 0.023022717828786506, + "compression/movement_sparsity/model_sparsity": 0.02223181664520338, + "compression_loss": 24.1318359375, + "distillation_loss": 0.45048242807388306, + "epoch": 1.05, + "learning_rate": 4.4733727810650885e-05, + "loss": 24.7289, + "step": 1246, + "task_loss": 0.6320061087608337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22692793162556762, + "compression/movement_sparsity/importance_threshold": -0.003997483847068564, + "compression/movement_sparsity/linear_layer_sparsity": 0.023806863016688492, + "compression/movement_sparsity/model_sparsity": 0.022989024033588178, + "compression_loss": 24.4996337890625, + "distillation_loss": 0.34416663646698, + "epoch": 1.05, + "learning_rate": 4.472950126796281e-05, + "loss": 25.1268, + "step": 1247, + "task_loss": 0.700141191482544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23033186583223103, + "compression/movement_sparsity/importance_threshold": -0.00398679539325228, + "compression/movement_sparsity/linear_layer_sparsity": 0.024782951531024968, + "compression/movement_sparsity/model_sparsity": 0.023931580904657697, + "compression_loss": 24.866764068603516, + "distillation_loss": 0.6046329140663147, + "epoch": 1.05, + "learning_rate": 4.472527472527473e-05, + "loss": 25.4093, + "step": 1248, + "task_loss": 0.9314168691635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23372972700891603, + "compression/movement_sparsity/importance_threshold": -0.003976126008929267, + "compression/movement_sparsity/linear_layer_sparsity": 0.025749691497934924, + "compression/movement_sparsity/model_sparsity": 0.0248651103796643, + "compression_loss": 25.233219146728516, + "distillation_loss": 0.5717558860778809, + "epoch": 1.06, + "learning_rate": 4.4721048182586645e-05, + "loss": 25.6036, + "step": 1249, + "task_loss": 0.4934878647327423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23712152057797037, + "compression/movement_sparsity/importance_threshold": -0.003965475677073194, + "compression/movement_sparsity/linear_layer_sparsity": 0.026788536906538946, + "compression/movement_sparsity/model_sparsity": 0.025868268252620487, + "compression_loss": 25.599010467529297, + "distillation_loss": 0.4805406928062439, + "epoch": 1.06, + "learning_rate": 4.4716821639898564e-05, + "loss": 26.1047, + "step": 1250, + "task_loss": 0.6133074760437012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24050725196174294, + "compression/movement_sparsity/importance_threshold": -0.003954844380657724, + "compression/movement_sparsity/linear_layer_sparsity": 0.0276484124830963, + "compression/movement_sparsity/model_sparsity": 0.026698604457836412, + "compression_loss": 25.964120864868164, + "distillation_loss": 0.7605350613594055, + "epoch": 1.06, + "learning_rate": 4.4712595097210484e-05, + "loss": 26.4628, + "step": 1251, + "task_loss": 0.9141960144042969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24388692658258165, + "compression/movement_sparsity/importance_threshold": -0.003944232102656525, + "compression/movement_sparsity/linear_layer_sparsity": 0.028573918678321444, + "compression/movement_sparsity/model_sparsity": 0.02759231666806549, + "compression_loss": 26.328554153442383, + "distillation_loss": 0.3859879970550537, + "epoch": 1.06, + "learning_rate": 4.4708368554522404e-05, + "loss": 26.8008, + "step": 1252, + "task_loss": 1.35304594039917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24726054986283408, + "compression/movement_sparsity/importance_threshold": -0.0039336388260432665, + "compression/movement_sparsity/linear_layer_sparsity": 0.029429048436159722, + "compression/movement_sparsity/model_sparsity": 0.02841807008803519, + "compression_loss": 26.692289352416992, + "distillation_loss": 0.5085939764976501, + "epoch": 1.06, + "learning_rate": 4.4704142011834324e-05, + "loss": 27.3367, + "step": 1253, + "task_loss": 1.047324538230896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2506281272248496, + "compression/movement_sparsity/importance_threshold": -0.003923064533791611, + "compression/movement_sparsity/linear_layer_sparsity": 0.030428985285767925, + "compression/movement_sparsity/model_sparsity": 0.029383656030693792, + "compression_loss": 27.055377960205078, + "distillation_loss": 0.5203262567520142, + "epoch": 1.06, + "learning_rate": 4.469991546914624e-05, + "loss": 27.6316, + "step": 1254, + "task_loss": 0.8381924629211426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25398966409097584, + "compression/movement_sparsity/importance_threshold": -0.003912509208875228, + "compression/movement_sparsity/linear_layer_sparsity": 0.03158801437997381, + "compression/movement_sparsity/model_sparsity": 0.030502868909923157, + "compression_loss": 27.417831420898438, + "distillation_loss": 0.5855239629745483, + "epoch": 1.06, + "learning_rate": 4.4695688926458156e-05, + "loss": 27.9559, + "step": 1255, + "task_loss": 1.0134457349777222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2573451658835608, + "compression/movement_sparsity/importance_threshold": -0.003901972834267782, + "compression/movement_sparsity/linear_layer_sparsity": 0.03247214371350251, + "compression/movement_sparsity/model_sparsity": 0.03135662568094518, + "compression_loss": 27.779600143432617, + "distillation_loss": 0.6592015624046326, + "epoch": 1.06, + "learning_rate": 4.4691462383770076e-05, + "loss": 28.3672, + "step": 1256, + "task_loss": 0.5244863629341125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26069463802495263, + "compression/movement_sparsity/importance_threshold": -0.0038914553929429428, + "compression/movement_sparsity/linear_layer_sparsity": 0.03370583002127653, + "compression/movement_sparsity/model_sparsity": 0.03254793106878416, + "compression_loss": 28.14071273803711, + "distillation_loss": 0.6236889958381653, + "epoch": 1.06, + "learning_rate": 4.4687235841081996e-05, + "loss": 28.6155, + "step": 1257, + "task_loss": 0.9864320755004883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2640380859375, + "compression/movement_sparsity/importance_threshold": -0.0038809568678743744, + "compression/movement_sparsity/linear_layer_sparsity": 0.03460236048914653, + "compression/movement_sparsity/model_sparsity": 0.033413662957032506, + "compression_loss": 28.501176834106445, + "distillation_loss": 0.4905117154121399, + "epoch": 1.06, + "learning_rate": 4.4683009298393915e-05, + "loss": 28.9498, + "step": 1258, + "task_loss": 0.5909712910652161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26737551504355084, + "compression/movement_sparsity/importance_threshold": -0.0038704772420357445, + "compression/movement_sparsity/linear_layer_sparsity": 0.035666783237329476, + "compression/movement_sparsity/model_sparsity": 0.034441519509267986, + "compression_loss": 28.860977172851562, + "distillation_loss": 0.2632543742656708, + "epoch": 1.06, + "learning_rate": 4.4678782755705835e-05, + "loss": 29.2396, + "step": 1259, + "task_loss": 0.026146475225687027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27070693076545305, + "compression/movement_sparsity/importance_threshold": -0.003860016498400721, + "compression/movement_sparsity/linear_layer_sparsity": 0.0368348031539328, + "compression/movement_sparsity/model_sparsity": 0.035569414348486435, + "compression_loss": 29.220075607299805, + "distillation_loss": 0.6554915308952332, + "epoch": 1.07, + "learning_rate": 4.4674556213017755e-05, + "loss": 29.6711, + "step": 1260, + "task_loss": 0.5693647861480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27403233852555536, + "compression/movement_sparsity/importance_threshold": -0.003849574619942968, + "compression/movement_sparsity/linear_layer_sparsity": 0.03797514707745329, + "compression/movement_sparsity/model_sparsity": 0.036670583950125756, + "compression_loss": 29.578508377075195, + "distillation_loss": 0.48726174235343933, + "epoch": 1.07, + "learning_rate": 4.4670329670329675e-05, + "loss": 30.0596, + "step": 1261, + "task_loss": 0.6845362186431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27735174374620586, + "compression/movement_sparsity/importance_threshold": -0.003839151589636154, + "compression/movement_sparsity/linear_layer_sparsity": 0.039314290723798884, + "compression/movement_sparsity/model_sparsity": 0.03796372389253166, + "compression_loss": 29.93628692626953, + "distillation_loss": 0.4077973961830139, + "epoch": 1.07, + "learning_rate": 4.466610312764159e-05, + "loss": 30.511, + "step": 1262, + "task_loss": 0.5983346104621887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2806651518497526, + "compression/movement_sparsity/importance_threshold": -0.003828747390453945, + "compression/movement_sparsity/linear_layer_sparsity": 0.04042754293845069, + "compression/movement_sparsity/model_sparsity": 0.03903873246884578, + "compression_loss": 30.29340362548828, + "distillation_loss": 0.4096171259880066, + "epoch": 1.07, + "learning_rate": 4.466187658495351e-05, + "loss": 30.8892, + "step": 1263, + "task_loss": 0.16335059702396393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28397256825854333, + "compression/movement_sparsity/importance_threshold": -0.0038183620053700096, + "compression/movement_sparsity/linear_layer_sparsity": 0.04175581174391238, + "compression/movement_sparsity/model_sparsity": 0.040321371154607065, + "compression_loss": 30.649873733520508, + "distillation_loss": 0.6780976057052612, + "epoch": 1.07, + "learning_rate": 4.4657650042265434e-05, + "loss": 31.2155, + "step": 1264, + "task_loss": 0.8164452314376831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2872739983949271, + "compression/movement_sparsity/importance_threshold": -0.003807995417358012, + "compression/movement_sparsity/linear_layer_sparsity": 0.04274527917433629, + "compression/movement_sparsity/model_sparsity": 0.04127684733483806, + "compression_loss": 31.00570297241211, + "distillation_loss": 0.3203093111515045, + "epoch": 1.07, + "learning_rate": 4.465342349957735e-05, + "loss": 31.419, + "step": 1265, + "task_loss": 0.5225319862365723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2905694476812517, + "compression/movement_sparsity/importance_threshold": -0.00379764760939162, + "compression/movement_sparsity/linear_layer_sparsity": 0.044160737493551414, + "compression/movement_sparsity/model_sparsity": 0.04264368030632903, + "compression_loss": 31.360864639282227, + "distillation_loss": 0.7801963686943054, + "epoch": 1.07, + "learning_rate": 4.4649196956889267e-05, + "loss": 32.0025, + "step": 1266, + "task_loss": 1.1612662076950073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.293858921539865, + "compression/movement_sparsity/importance_threshold": -0.003787318564444501, + "compression/movement_sparsity/linear_layer_sparsity": 0.04540467858549228, + "compression/movement_sparsity/model_sparsity": 0.04384488819495132, + "compression_loss": 31.71538734436035, + "distillation_loss": 0.44098103046417236, + "epoch": 1.07, + "learning_rate": 4.4644970414201186e-05, + "loss": 32.1975, + "step": 1267, + "task_loss": 1.3053979873657227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29714242539311586, + "compression/movement_sparsity/importance_threshold": -0.003777008265490321, + "compression/movement_sparsity/linear_layer_sparsity": 0.04666427610953903, + "compression/movement_sparsity/model_sparsity": 0.045061214669071835, + "compression_loss": 32.06924819946289, + "distillation_loss": 0.350536584854126, + "epoch": 1.07, + "learning_rate": 4.46407438715131e-05, + "loss": 32.6281, + "step": 1268, + "task_loss": 0.8032504916191101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30041996466335225, + "compression/movement_sparsity/importance_threshold": -0.003766716695502745, + "compression/movement_sparsity/linear_layer_sparsity": 0.04781054634437454, + "compression/movement_sparsity/model_sparsity": 0.04616810699500105, + "compression_loss": 32.42247772216797, + "distillation_loss": 0.459830641746521, + "epoch": 1.07, + "learning_rate": 4.4636517328825026e-05, + "loss": 32.8516, + "step": 1269, + "task_loss": 0.7863566279411316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3036915447729217, + "compression/movement_sparsity/importance_threshold": -0.003756443837455444, + "compression/movement_sparsity/linear_layer_sparsity": 0.04906270318781652, + "compression/movement_sparsity/model_sparsity": 0.04737724839878577, + "compression_loss": 32.775001525878906, + "distillation_loss": 0.6473171710968018, + "epoch": 1.07, + "learning_rate": 4.4632290786136946e-05, + "loss": 33.1638, + "step": 1270, + "task_loss": 0.4224321246147156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3069571711441735, + "compression/movement_sparsity/importance_threshold": -0.0037461896743220804, + "compression/movement_sparsity/linear_layer_sparsity": 0.050244245110518906, + "compression/movement_sparsity/model_sparsity": 0.04851820072159523, + "compression_loss": 33.126888275146484, + "distillation_loss": 0.2960829734802246, + "epoch": 1.07, + "learning_rate": 4.462806424344886e-05, + "loss": 33.4904, + "step": 1271, + "task_loss": 0.5079767107963562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31021684919945514, + "compression/movement_sparsity/importance_threshold": -0.003735954189076323, + "compression/movement_sparsity/linear_layer_sparsity": 0.051406565274992294, + "compression/movement_sparsity/model_sparsity": 0.04964059161270389, + "compression_loss": 33.47807312011719, + "distillation_loss": 0.36031287908554077, + "epoch": 1.08, + "learning_rate": 4.462383770076078e-05, + "loss": 33.8437, + "step": 1272, + "task_loss": 0.8334219455718994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31347058436111475, + "compression/movement_sparsity/importance_threshold": -0.003725737364691839, + "compression/movement_sparsity/linear_layer_sparsity": 0.052793012094349355, + "compression/movement_sparsity/model_sparsity": 0.05097940971860674, + "compression_loss": 33.82857131958008, + "distillation_loss": 0.38945573568344116, + "epoch": 1.08, + "learning_rate": 4.46196111580727e-05, + "loss": 34.2726, + "step": 1273, + "task_loss": 0.9228662252426147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31671838205150105, + "compression/movement_sparsity/importance_threshold": -0.0037155391841422938, + "compression/movement_sparsity/linear_layer_sparsity": 0.05416271738234567, + "compression/movement_sparsity/model_sparsity": 0.05230206141625405, + "compression_loss": 34.178428649902344, + "distillation_loss": 1.5171643495559692, + "epoch": 1.08, + "learning_rate": 4.461538461538462e-05, + "loss": 35.3024, + "step": 1274, + "task_loss": 1.7723408937454224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31996024769296194, + "compression/movement_sparsity/importance_threshold": -0.003705359630401354, + "compression/movement_sparsity/linear_layer_sparsity": 0.05539598634425243, + "compression/movement_sparsity/model_sparsity": 0.05349296379534023, + "compression_loss": 34.527652740478516, + "distillation_loss": 0.5163383483886719, + "epoch": 1.08, + "learning_rate": 4.461115807269654e-05, + "loss": 35.4852, + "step": 1275, + "task_loss": 0.6891390681266785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32319618670784567, + "compression/movement_sparsity/importance_threshold": -0.003695198686442687, + "compression/movement_sparsity/linear_layer_sparsity": 0.05650888083387517, + "compression/movement_sparsity/model_sparsity": 0.05456762693558051, + "compression_loss": 34.87626266479492, + "distillation_loss": 1.208616018295288, + "epoch": 1.08, + "learning_rate": 4.460693153000846e-05, + "loss": 35.7273, + "step": 1276, + "task_loss": 0.8003085851669312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32642620451849996, + "compression/movement_sparsity/importance_threshold": -0.003685056335239961, + "compression/movement_sparsity/linear_layer_sparsity": 0.05764483666370566, + "compression/movement_sparsity/model_sparsity": 0.05566455918804744, + "compression_loss": 35.22423553466797, + "distillation_loss": 0.6112419962882996, + "epoch": 1.08, + "learning_rate": 4.460270498732038e-05, + "loss": 35.9297, + "step": 1277, + "task_loss": 0.41883552074432373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3296503065472739, + "compression/movement_sparsity/importance_threshold": -0.00367493255976684, + "compression/movement_sparsity/linear_layer_sparsity": 0.05893800071964737, + "compression/movement_sparsity/model_sparsity": 0.056913299080429594, + "compression_loss": 35.57155227661133, + "distillation_loss": 0.7337345480918884, + "epoch": 1.08, + "learning_rate": 4.459847844463229e-05, + "loss": 36.1913, + "step": 1278, + "task_loss": 0.759214460849762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33286849821651504, + "compression/movement_sparsity/importance_threshold": -0.003664827342996992, + "compression/movement_sparsity/linear_layer_sparsity": 0.06020794842120205, + "compression/movement_sparsity/model_sparsity": 0.058139620171619784, + "compression_loss": 35.918243408203125, + "distillation_loss": 0.5227680206298828, + "epoch": 1.08, + "learning_rate": 4.459425190194421e-05, + "loss": 36.4221, + "step": 1279, + "task_loss": 0.09014008939266205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33608078494857196, + "compression/movement_sparsity/importance_threshold": -0.0036547406679040837, + "compression/movement_sparsity/linear_layer_sparsity": 0.06133783484970588, + "compression/movement_sparsity/model_sparsity": 0.059230691525367286, + "compression_loss": 36.26424026489258, + "distillation_loss": 0.6876983642578125, + "epoch": 1.08, + "learning_rate": 4.459002535925613e-05, + "loss": 36.829, + "step": 1280, + "task_loss": 1.2024675607681274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3392871721657921, + "compression/movement_sparsity/importance_threshold": -0.003644672517461783, + "compression/movement_sparsity/linear_layer_sparsity": 0.0625562224504031, + "compression/movement_sparsity/model_sparsity": 0.060407223763781874, + "compression_loss": 36.609622955322266, + "distillation_loss": 0.3284452259540558, + "epoch": 1.08, + "learning_rate": 4.458579881656805e-05, + "loss": 37.175, + "step": 1281, + "task_loss": 0.7041627764701843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3424876652905246, + "compression/movement_sparsity/importance_threshold": -0.003634622874643754, + "compression/movement_sparsity/linear_layer_sparsity": 0.06385704182196703, + "compression/movement_sparsity/model_sparsity": 0.061663355988144125, + "compression_loss": 36.95436096191406, + "distillation_loss": 0.5951570272445679, + "epoch": 1.08, + "learning_rate": 4.458157227387997e-05, + "loss": 37.4481, + "step": 1282, + "task_loss": 0.9332437515258789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3456822697451173, + "compression/movement_sparsity/importance_threshold": -0.003624591722423665, + "compression/movement_sparsity/linear_layer_sparsity": 0.06531987485919943, + "compression/movement_sparsity/model_sparsity": 0.06307593621034681, + "compression_loss": 37.29840850830078, + "distillation_loss": 0.6705347895622253, + "epoch": 1.08, + "learning_rate": 4.457734573119189e-05, + "loss": 37.9679, + "step": 1283, + "task_loss": 0.5198345184326172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34887099095191787, + "compression/movement_sparsity/importance_threshold": -0.0036145790437751837, + "compression/movement_sparsity/linear_layer_sparsity": 0.06659893262482791, + "compression/movement_sparsity/model_sparsity": 0.06431105440688402, + "compression_loss": 37.64185333251953, + "distillation_loss": 0.5819041728973389, + "epoch": 1.09, + "learning_rate": 4.45731191885038e-05, + "loss": 38.0916, + "step": 1284, + "task_loss": 0.2840628921985626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3520538343332754, + "compression/movement_sparsity/importance_threshold": -0.0036045848216719745, + "compression/movement_sparsity/linear_layer_sparsity": 0.06791487184095411, + "compression/movement_sparsity/model_sparsity": 0.06558178706263375, + "compression_loss": 37.9846305847168, + "distillation_loss": 0.39920535683631897, + "epoch": 1.09, + "learning_rate": 4.456889264581572e-05, + "loss": 38.4294, + "step": 1285, + "task_loss": 0.8727322220802307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35523080531153767, + "compression/movement_sparsity/importance_threshold": -0.0035946090390877056, + "compression/movement_sparsity/linear_layer_sparsity": 0.06926108651870778, + "compression/movement_sparsity/model_sparsity": 0.06688175512476581, + "compression_loss": 38.32673645019531, + "distillation_loss": 0.3835964500904083, + "epoch": 1.09, + "learning_rate": 4.456466610312765e-05, + "loss": 38.8715, + "step": 1286, + "task_loss": 0.44173380732536316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3584019093090525, + "compression/movement_sparsity/importance_threshold": -0.0035846516789960446, + "compression/movement_sparsity/linear_layer_sparsity": 0.07057676340314599, + "compression/movement_sparsity/model_sparsity": 0.06815223446072806, + "compression_loss": 38.6682243347168, + "distillation_loss": 0.29448437690734863, + "epoch": 1.09, + "learning_rate": 4.456043956043957e-05, + "loss": 39.3928, + "step": 1287, + "task_loss": 0.0277054812759161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3615671517481689, + "compression/movement_sparsity/importance_threshold": -0.0035747127243706553, + "compression/movement_sparsity/linear_layer_sparsity": 0.07190335090097102, + "compression/movement_sparsity/model_sparsity": 0.0694332495969423, + "compression_loss": 39.009124755859375, + "distillation_loss": 0.5504357218742371, + "epoch": 1.09, + "learning_rate": 4.455621301775148e-05, + "loss": 39.5186, + "step": 1288, + "task_loss": 1.0629488229751587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3647265380512345, + "compression/movement_sparsity/importance_threshold": -0.003564792158185207, + "compression/movement_sparsity/linear_layer_sparsity": 0.07328470610074758, + "compression/movement_sparsity/model_sparsity": 0.07076715099606089, + "compression_loss": 39.34932327270508, + "distillation_loss": 0.5402329564094543, + "epoch": 1.09, + "learning_rate": 4.45519864750634e-05, + "loss": 40.0552, + "step": 1289, + "task_loss": 0.2442416101694107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36788007364059727, + "compression/movement_sparsity/importance_threshold": -0.0035548899634133668, + "compression/movement_sparsity/linear_layer_sparsity": 0.07455445109145245, + "compression/movement_sparsity/model_sparsity": 0.07199327634014256, + "compression_loss": 39.68889236450195, + "distillation_loss": 0.3655256927013397, + "epoch": 1.09, + "learning_rate": 4.454775993237532e-05, + "loss": 40.4195, + "step": 1290, + "task_loss": 0.3499029278755188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37102776393860626, + "compression/movement_sparsity/importance_threshold": -0.003545006123028798, + "compression/movement_sparsity/linear_layer_sparsity": 0.07616641969330659, + "compression/movement_sparsity/model_sparsity": 0.07354986886152756, + "compression_loss": 40.02782440185547, + "distillation_loss": 0.4868693947792053, + "epoch": 1.09, + "learning_rate": 4.454353338968724e-05, + "loss": 40.4886, + "step": 1291, + "task_loss": 0.7678642272949219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37416961436760904, + "compression/movement_sparsity/importance_threshold": -0.0035351406200051705, + "compression/movement_sparsity/linear_layer_sparsity": 0.07752653795052367, + "compression/movement_sparsity/model_sparsity": 0.07486326287239606, + "compression_loss": 40.3660774230957, + "distillation_loss": 1.2233290672302246, + "epoch": 1.09, + "learning_rate": 4.453930684699916e-05, + "loss": 41.1705, + "step": 1292, + "task_loss": 1.277454137802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.377305630349954, + "compression/movement_sparsity/importance_threshold": -0.003525293437316149, + "compression/movement_sparsity/linear_layer_sparsity": 0.07884109396320411, + "compression/movement_sparsity/model_sparsity": 0.07613265984199362, + "compression_loss": 40.7037239074707, + "distillation_loss": 0.6494832634925842, + "epoch": 1.09, + "learning_rate": 4.453508030431108e-05, + "loss": 41.3183, + "step": 1293, + "task_loss": 0.46674782037734985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.38043581730798876, + "compression/movement_sparsity/importance_threshold": -0.003515464557935404, + "compression/movement_sparsity/linear_layer_sparsity": 0.08025448940141823, + "compression/movement_sparsity/model_sparsity": 0.07749750079879214, + "compression_loss": 41.040740966796875, + "distillation_loss": 0.545191764831543, + "epoch": 1.09, + "learning_rate": 4.453085376162299e-05, + "loss": 41.6571, + "step": 1294, + "task_loss": 0.5185684561729431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3835601806640625, + "compression/movement_sparsity/importance_threshold": -0.003505653964836597, + "compression/movement_sparsity/linear_layer_sparsity": 0.08188674101242098, + "compression/movement_sparsity/model_sparsity": 0.07907367954556364, + "compression_loss": 41.37712097167969, + "distillation_loss": 0.5012102723121643, + "epoch": 1.09, + "learning_rate": 4.452662721893491e-05, + "loss": 41.9171, + "step": 1295, + "task_loss": 0.6159313917160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3866787258405228, + "compression/movement_sparsity/importance_threshold": -0.003495861640993398, + "compression/movement_sparsity/linear_layer_sparsity": 0.08340428213076671, + "compression/movement_sparsity/model_sparsity": 0.08053908845799168, + "compression_loss": 41.712806701660156, + "distillation_loss": 0.8001213073730469, + "epoch": 1.1, + "learning_rate": 4.452240067624683e-05, + "loss": 42.3828, + "step": 1296, + "task_loss": 0.43087005615234375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3897914582597178, + "compression/movement_sparsity/importance_threshold": -0.003486087569379474, + "compression/movement_sparsity/linear_layer_sparsity": 0.0848317957834617, + "compression/movement_sparsity/model_sparsity": 0.08191756262517091, + "compression_loss": 42.04793167114258, + "distillation_loss": 0.5749796628952026, + "epoch": 1.1, + "learning_rate": 4.451817413355875e-05, + "loss": 42.4919, + "step": 1297, + "task_loss": 0.44719234108924866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3928983833439961, + "compression/movement_sparsity/importance_threshold": -0.003476331732968489, + "compression/movement_sparsity/linear_layer_sparsity": 0.086385765233935, + "compression/movement_sparsity/model_sparsity": 0.08341814844445126, + "compression_loss": 42.382362365722656, + "distillation_loss": 0.5764374136924744, + "epoch": 1.1, + "learning_rate": 4.451394759087067e-05, + "loss": 42.7172, + "step": 1298, + "task_loss": 0.30917418003082275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39599950651570576, + "compression/movement_sparsity/importance_threshold": -0.003466594114734112, + "compression/movement_sparsity/linear_layer_sparsity": 0.08800423250715068, + "compression/movement_sparsity/model_sparsity": 0.08498101638784429, + "compression_loss": 42.71617889404297, + "distillation_loss": 1.1756658554077148, + "epoch": 1.1, + "learning_rate": 4.450972104818259e-05, + "loss": 43.5296, + "step": 1299, + "task_loss": 0.970940887928009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3990948331971949, + "compression/movement_sparsity/importance_threshold": -0.0034568746976500085, + "compression/movement_sparsity/linear_layer_sparsity": 0.08962884072669884, + "compression/movement_sparsity/model_sparsity": 0.0865498143171715, + "compression_loss": 43.049354553222656, + "distillation_loss": 0.3907695412635803, + "epoch": 1.1, + "learning_rate": 4.4505494505494504e-05, + "loss": 43.5209, + "step": 1300, + "task_loss": 0.3283112943172455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40218436881081115, + "compression/movement_sparsity/importance_threshold": -0.003447173464689848, + "compression/movement_sparsity/linear_layer_sparsity": 0.09130783507483417, + "compression/movement_sparsity/model_sparsity": 0.08817113004425761, + "compression_loss": 43.38188552856445, + "distillation_loss": 0.5744103193283081, + "epoch": 1.1, + "learning_rate": 4.4501267962806424e-05, + "loss": 43.887, + "step": 1301, + "task_loss": 0.4608490765094757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40526811877890373, + "compression/movement_sparsity/importance_threshold": -0.003437490398827293, + "compression/movement_sparsity/linear_layer_sparsity": 0.09294863631403183, + "compression/movement_sparsity/model_sparsity": 0.0897555647131938, + "compression_loss": 43.713741302490234, + "distillation_loss": 0.7757353782653809, + "epoch": 1.1, + "learning_rate": 4.449704142011834e-05, + "loss": 44.3188, + "step": 1302, + "task_loss": 1.9043248891830444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40834608852382054, + "compression/movement_sparsity/importance_threshold": -0.003427825483036012, + "compression/movement_sparsity/linear_layer_sparsity": 0.09464021065902299, + "compression/movement_sparsity/model_sparsity": 0.09138902827554317, + "compression_loss": 44.044921875, + "distillation_loss": 0.5997978448867798, + "epoch": 1.1, + "learning_rate": 4.449281487743027e-05, + "loss": 44.5606, + "step": 1303, + "task_loss": 0.23925459384918213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.411418283467909, + "compression/movement_sparsity/importance_threshold": -0.0034181787002896744, + "compression/movement_sparsity/linear_layer_sparsity": 0.09646436982395731, + "compression/movement_sparsity/model_sparsity": 0.09315052196139202, + "compression_loss": 44.37547302246094, + "distillation_loss": 0.5133511424064636, + "epoch": 1.1, + "learning_rate": 4.448858833474218e-05, + "loss": 44.9213, + "step": 1304, + "task_loss": 1.0616811513900757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4144847090335183, + "compression/movement_sparsity/importance_threshold": -0.0034085500335619425, + "compression/movement_sparsity/linear_layer_sparsity": 0.09818016215341692, + "compression/movement_sparsity/model_sparsity": 0.09480737154594007, + "compression_loss": 44.70539093017578, + "distillation_loss": 0.34925776720046997, + "epoch": 1.1, + "learning_rate": 4.44843617920541e-05, + "loss": 45.2259, + "step": 1305, + "task_loss": 0.8480327725410461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4175453706429964, + "compression/movement_sparsity/importance_threshold": -0.003398939465826485, + "compression/movement_sparsity/linear_layer_sparsity": 0.09994365115341997, + "compression/movement_sparsity/model_sparsity": 0.0965102792736663, + "compression_loss": 45.034645080566406, + "distillation_loss": 0.27559226751327515, + "epoch": 1.1, + "learning_rate": 4.448013524936602e-05, + "loss": 45.3684, + "step": 1306, + "task_loss": 0.6624144911766052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4206002737186907, + "compression/movement_sparsity/importance_threshold": -0.0033893469800569706, + "compression/movement_sparsity/linear_layer_sparsity": 0.10182999485257531, + "compression/movement_sparsity/model_sparsity": 0.09833182126368369, + "compression_loss": 45.36323928833008, + "distillation_loss": 0.6117426156997681, + "epoch": 1.1, + "learning_rate": 4.4475908706677935e-05, + "loss": 46.0391, + "step": 1307, + "task_loss": 1.6863586902618408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42364942368295055, + "compression/movement_sparsity/importance_threshold": -0.0033797725592270623, + "compression/movement_sparsity/linear_layer_sparsity": 0.10368620618011484, + "compression/movement_sparsity/model_sparsity": 0.10012426602174826, + "compression_loss": 45.691219329833984, + "distillation_loss": 0.3198488652706146, + "epoch": 1.11, + "learning_rate": 4.447168216398986e-05, + "loss": 46.1917, + "step": 1308, + "task_loss": 0.4496100842952728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42669282595812374, + "compression/movement_sparsity/importance_threshold": -0.0033702161863104286, + "compression/movement_sparsity/linear_layer_sparsity": 0.10527419528085326, + "compression/movement_sparsity/model_sparsity": 0.10165770281165044, + "compression_loss": 46.01858139038086, + "distillation_loss": 0.38542452454566956, + "epoch": 1.11, + "learning_rate": 4.446745562130178e-05, + "loss": 46.554, + "step": 1309, + "task_loss": 0.3755490481853485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42973048596655783, + "compression/movement_sparsity/importance_threshold": -0.0033606778442807376, + "compression/movement_sparsity/linear_layer_sparsity": 0.10724181410661465, + "compression/movement_sparsity/model_sparsity": 0.1035577278776434, + "compression_loss": 46.345279693603516, + "distillation_loss": 0.6864478588104248, + "epoch": 1.11, + "learning_rate": 4.4463229078613694e-05, + "loss": 46.9479, + "step": 1310, + "task_loss": 0.6452550888061523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4327624091306019, + "compression/movement_sparsity/importance_threshold": -0.0033511575161116534, + "compression/movement_sparsity/linear_layer_sparsity": 0.10911438539215058, + "compression/movement_sparsity/model_sparsity": 0.1053659705788181, + "compression_loss": 46.671348571777344, + "distillation_loss": 0.39784860610961914, + "epoch": 1.11, + "learning_rate": 4.4459002535925614e-05, + "loss": 47.2376, + "step": 1311, + "task_loss": 0.6274633407592773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4357886008726038, + "compression/movement_sparsity/importance_threshold": -0.003341655184776844, + "compression/movement_sparsity/linear_layer_sparsity": 0.11101962889700877, + "compression/movement_sparsity/model_sparsity": 0.10720576310806983, + "compression_loss": 46.99675750732422, + "distillation_loss": 0.4980035126209259, + "epoch": 1.11, + "learning_rate": 4.4454775993237534e-05, + "loss": 47.4028, + "step": 1312, + "task_loss": 0.6609987616539001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4388090666149118, + "compression/movement_sparsity/importance_threshold": -0.0033321708332499757, + "compression/movement_sparsity/linear_layer_sparsity": 0.11290673574289281, + "compression/movement_sparsity/model_sparsity": 0.10902804202837806, + "compression_loss": 47.32157516479492, + "distillation_loss": 0.6137720346450806, + "epoch": 1.11, + "learning_rate": 4.445054945054945e-05, + "loss": 47.7557, + "step": 1313, + "task_loss": 1.381227731704712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4418238117798735, + "compression/movement_sparsity/importance_threshold": -0.003322704444504718, + "compression/movement_sparsity/linear_layer_sparsity": 0.1147247658856623, + "compression/movement_sparsity/model_sparsity": 0.11078361724282851, + "compression_loss": 47.64578628540039, + "distillation_loss": 0.7195747494697571, + "epoch": 1.11, + "learning_rate": 4.4446322907861373e-05, + "loss": 48.2647, + "step": 1314, + "task_loss": 0.5013588666915894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44483284178983795, + "compression/movement_sparsity/importance_threshold": -0.0033132560015147333, + "compression/movement_sparsity/linear_layer_sparsity": 0.11677177581954327, + "compression/movement_sparsity/model_sparsity": 0.11276030608814154, + "compression_loss": 47.9693489074707, + "distillation_loss": 0.4741535484790802, + "epoch": 1.11, + "learning_rate": 4.444209636517329e-05, + "loss": 48.4193, + "step": 1315, + "task_loss": 1.341664433479309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44783616206715304, + "compression/movement_sparsity/importance_threshold": -0.0033038254872536903, + "compression/movement_sparsity/linear_layer_sparsity": 0.11879701222332115, + "compression/movement_sparsity/model_sparsity": 0.11471596939109373, + "compression_loss": 48.292205810546875, + "distillation_loss": 0.4550090730190277, + "epoch": 1.11, + "learning_rate": 4.443786982248521e-05, + "loss": 48.7506, + "step": 1316, + "task_loss": 0.25337034463882446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.450833778034167, + "compression/movement_sparsity/importance_threshold": -0.0032944128846952555, + "compression/movement_sparsity/linear_layer_sparsity": 0.12061949008061883, + "compression/movement_sparsity/model_sparsity": 0.11647583952739556, + "compression_loss": 48.61443328857422, + "distillation_loss": 0.5711060762405396, + "epoch": 1.11, + "learning_rate": 4.4433643279797126e-05, + "loss": 49.2495, + "step": 1317, + "task_loss": 0.4683636724948883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45382569511322746, + "compression/movement_sparsity/importance_threshold": -0.003285018176813097, + "compression/movement_sparsity/linear_layer_sparsity": 0.1226937348133801, + "compression/movement_sparsity/model_sparsity": 0.11847882757246331, + "compression_loss": 48.9360466003418, + "distillation_loss": 0.9420948624610901, + "epoch": 1.11, + "learning_rate": 4.4429416737109046e-05, + "loss": 49.7345, + "step": 1318, + "task_loss": 0.6032072305679321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4568119187266834, + "compression/movement_sparsity/importance_threshold": -0.0032756413465808796, + "compression/movement_sparsity/linear_layer_sparsity": 0.12462016756412722, + "compression/movement_sparsity/model_sparsity": 0.12033908143182193, + "compression_loss": 49.2570686340332, + "distillation_loss": 0.5850843191146851, + "epoch": 1.11, + "learning_rate": 4.4425190194420965e-05, + "loss": 49.8058, + "step": 1319, + "task_loss": 0.4643186330795288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4597924542968828, + "compression/movement_sparsity/importance_threshold": -0.003266282376972271, + "compression/movement_sparsity/linear_layer_sparsity": 0.12671122537325508, + "compression/movement_sparsity/model_sparsity": 0.12235830497235999, + "compression_loss": 49.57742691040039, + "distillation_loss": 0.6551699638366699, + "epoch": 1.12, + "learning_rate": 4.4420963651732885e-05, + "loss": 50.3427, + "step": 1320, + "task_loss": 1.0393555164337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46276730724617354, + "compression/movement_sparsity/importance_threshold": -0.003256941250960938, + "compression/movement_sparsity/linear_layer_sparsity": 0.12876782233791526, + "compression/movement_sparsity/model_sparsity": 0.12434425150445183, + "compression_loss": 49.89724349975586, + "distillation_loss": 0.5629104971885681, + "epoch": 1.12, + "learning_rate": 4.4416737109044805e-05, + "loss": 50.5745, + "step": 1321, + "task_loss": 0.513153612613678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4657364829969042, + "compression/movement_sparsity/importance_threshold": -0.003247617951520546, + "compression/movement_sparsity/linear_layer_sparsity": 0.1308728075748418, + "compression/movement_sparsity/model_sparsity": 0.1263769240227979, + "compression_loss": 50.21639633178711, + "distillation_loss": 0.5485292673110962, + "epoch": 1.12, + "learning_rate": 4.4412510566356725e-05, + "loss": 50.9856, + "step": 1322, + "task_loss": 1.4749886989593506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46869998697142295, + "compression/movement_sparsity/importance_threshold": -0.003238312461624763, + "compression/movement_sparsity/linear_layer_sparsity": 0.13266208854944123, + "compression/movement_sparsity/model_sparsity": 0.12810473769144773, + "compression_loss": 50.534996032714844, + "distillation_loss": 0.868882417678833, + "epoch": 1.12, + "learning_rate": 4.440828402366864e-05, + "loss": 51.2033, + "step": 1323, + "task_loss": 0.9010410904884338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4716578245920773, + "compression/movement_sparsity/importance_threshold": -0.003229024764247257, + "compression/movement_sparsity/linear_layer_sparsity": 0.13473499777542727, + "compression/movement_sparsity/model_sparsity": 0.1301064361085065, + "compression_loss": 50.852928161621094, + "distillation_loss": 0.30092743039131165, + "epoch": 1.12, + "learning_rate": 4.440405748098056e-05, + "loss": 51.2459, + "step": 1324, + "task_loss": 0.7190616726875305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47461000128121655, + "compression/movement_sparsity/importance_threshold": -0.003219754842361691, + "compression/movement_sparsity/linear_layer_sparsity": 0.13662842443015832, + "compression/movement_sparsity/model_sparsity": 0.13193481773278584, + "compression_loss": 51.170188903808594, + "distillation_loss": 0.9655022621154785, + "epoch": 1.12, + "learning_rate": 4.4399830938292484e-05, + "loss": 51.8331, + "step": 1325, + "task_loss": 0.8328261375427246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4775565224611882, + "compression/movement_sparsity/importance_threshold": -0.003210502678941735, + "compression/movement_sparsity/linear_layer_sparsity": 0.13875555284638466, + "compression/movement_sparsity/model_sparsity": 0.13398887274410237, + "compression_loss": 51.48688507080078, + "distillation_loss": 0.5974711775779724, + "epoch": 1.12, + "learning_rate": 4.43956043956044e-05, + "loss": 52.1163, + "step": 1326, + "task_loss": 0.6383416056632996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48049739355434024, + "compression/movement_sparsity/importance_threshold": -0.003201268256961055, + "compression/movement_sparsity/linear_layer_sparsity": 0.14075847913251585, + "compression/movement_sparsity/model_sparsity": 0.13592299235058297, + "compression_loss": 51.80292892456055, + "distillation_loss": 1.082674503326416, + "epoch": 1.12, + "learning_rate": 4.4391377852916316e-05, + "loss": 52.5813, + "step": 1327, + "task_loss": 0.6380428075790405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4834326199830216, + "compression/movement_sparsity/importance_threshold": -0.003192051559393317, + "compression/movement_sparsity/linear_layer_sparsity": 0.14276274092542224, + "compression/movement_sparsity/model_sparsity": 0.13785840158507257, + "compression_loss": 52.11836624145508, + "distillation_loss": 0.5320615172386169, + "epoch": 1.12, + "learning_rate": 4.4387151310228236e-05, + "loss": 52.6443, + "step": 1328, + "task_loss": 0.32004961371421814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48636220716958023, + "compression/movement_sparsity/importance_threshold": -0.003182852569212187, + "compression/movement_sparsity/linear_layer_sparsity": 0.14468940023535445, + "compression/movement_sparsity/model_sparsity": 0.1397188742206113, + "compression_loss": 52.43320083618164, + "distillation_loss": 0.8700262904167175, + "epoch": 1.12, + "learning_rate": 4.438292476754015e-05, + "loss": 53.2182, + "step": 1329, + "task_loss": 0.2888091504573822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.489286160536364, + "compression/movement_sparsity/importance_threshold": -0.0031736712693913333, + "compression/movement_sparsity/linear_layer_sparsity": 0.14658221875753608, + "compression/movement_sparsity/model_sparsity": 0.1415466686035651, + "compression_loss": 52.74740219116211, + "distillation_loss": 0.33761823177337646, + "epoch": 1.12, + "learning_rate": 4.437869822485207e-05, + "loss": 53.3991, + "step": 1330, + "task_loss": 0.8333884477615356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49220448550572093, + "compression/movement_sparsity/importance_threshold": -0.003164507642904423, + "compression/movement_sparsity/linear_layer_sparsity": 0.14862351701511947, + "compression/movement_sparsity/model_sparsity": 0.14351784198623255, + "compression_loss": 53.060997009277344, + "distillation_loss": 0.7438492178916931, + "epoch": 1.13, + "learning_rate": 4.4374471682163995e-05, + "loss": 53.6375, + "step": 1331, + "task_loss": 1.1630669832229614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4951171875, + "compression/movement_sparsity/importance_threshold": -0.003155361672725121, + "compression/movement_sparsity/linear_layer_sparsity": 0.15089827855084542, + "compression/movement_sparsity/model_sparsity": 0.1457144584652213, + "compression_loss": 53.37394714355469, + "distillation_loss": 0.7777900695800781, + "epoch": 1.13, + "learning_rate": 4.4370245139475915e-05, + "loss": 54.312, + "step": 1332, + "task_loss": 1.1477227210998535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4980242719415489, + "compression/movement_sparsity/importance_threshold": -0.0031462333418270953, + "compression/movement_sparsity/linear_layer_sparsity": 0.1530567436796188, + "compression/movement_sparsity/model_sparsity": 0.1477987736766059, + "compression_loss": 53.68632507324219, + "distillation_loss": 0.3847755789756775, + "epoch": 1.13, + "learning_rate": 4.436601859678783e-05, + "loss": 54.2379, + "step": 1333, + "task_loss": 1.5904806852340698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5009257442527161, + "compression/movement_sparsity/importance_threshold": -0.0031371226331840113, + "compression/movement_sparsity/linear_layer_sparsity": 0.15496221374366206, + "compression/movement_sparsity/model_sparsity": 0.14963878498203773, + "compression_loss": 53.998050689697266, + "distillation_loss": 0.34787118434906006, + "epoch": 1.13, + "learning_rate": 4.436179205409975e-05, + "loss": 54.5374, + "step": 1334, + "task_loss": 0.7878853678703308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5038216098558489, + "compression/movement_sparsity/importance_threshold": -0.003128029529769539, + "compression/movement_sparsity/linear_layer_sparsity": 0.1571680178179498, + "compression/movement_sparsity/model_sparsity": 0.15176881290052666, + "compression_loss": 54.309173583984375, + "distillation_loss": 0.9435323476791382, + "epoch": 1.13, + "learning_rate": 4.435756551141167e-05, + "loss": 55.0187, + "step": 1335, + "task_loss": 0.3080785572528839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5067118741732968, + "compression/movement_sparsity/importance_threshold": -0.0031189540145573407, + "compression/movement_sparsity/linear_layer_sparsity": 0.15907311823279638, + "compression/movement_sparsity/model_sparsity": 0.15360846725534885, + "compression_loss": 54.619693756103516, + "distillation_loss": 0.4913204610347748, + "epoch": 1.13, + "learning_rate": 4.435333896872359e-05, + "loss": 55.2671, + "step": 1336, + "task_loss": 0.5756125450134277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5095965426274073, + "compression/movement_sparsity/importance_threshold": -0.003109896070521086, + "compression/movement_sparsity/linear_layer_sparsity": 0.1613855005174135, + "compression/movement_sparsity/model_sparsity": 0.15584141209476937, + "compression_loss": 54.929561614990234, + "distillation_loss": 0.6135482788085938, + "epoch": 1.13, + "learning_rate": 4.434911242603551e-05, + "loss": 55.6303, + "step": 1337, + "task_loss": 0.9360067248344421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5124756206405283, + "compression/movement_sparsity/importance_threshold": -0.0031008556806344416, + "compression/movement_sparsity/linear_layer_sparsity": 0.16348936488258226, + "compression/movement_sparsity/model_sparsity": 0.15787300224675077, + "compression_loss": 55.23884582519531, + "distillation_loss": 0.6394940614700317, + "epoch": 1.13, + "learning_rate": 4.434488588334743e-05, + "loss": 55.84, + "step": 1338, + "task_loss": 1.604577660560608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5153491136350088, + "compression/movement_sparsity/importance_threshold": -0.0030918328278710726, + "compression/movement_sparsity/linear_layer_sparsity": 0.1656997239889069, + "compression/movement_sparsity/model_sparsity": 0.1600074287179132, + "compression_loss": 55.54752731323242, + "distillation_loss": 1.371919870376587, + "epoch": 1.13, + "learning_rate": 4.434065934065934e-05, + "loss": 56.3289, + "step": 1339, + "task_loss": 1.1545560359954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5182170270331966, + "compression/movement_sparsity/importance_threshold": -0.003082827495204646, + "compression/movement_sparsity/linear_layer_sparsity": 0.16785873762939152, + "compression/movement_sparsity/model_sparsity": 0.16209227359794434, + "compression_loss": 55.855594635009766, + "distillation_loss": 0.6714638471603394, + "epoch": 1.13, + "learning_rate": 4.433643279797126e-05, + "loss": 56.4418, + "step": 1340, + "task_loss": 1.5307416915893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5210793662574396, + "compression/movement_sparsity/importance_threshold": -0.0030738396656088307, + "compression/movement_sparsity/linear_layer_sparsity": 0.17007337751189747, + "compression/movement_sparsity/model_sparsity": 0.16423083378745704, + "compression_loss": 56.163063049316406, + "distillation_loss": 0.45750027894973755, + "epoch": 1.13, + "learning_rate": 4.433220625528318e-05, + "loss": 56.8211, + "step": 1341, + "task_loss": 0.32405903935432434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5239361367300863, + "compression/movement_sparsity/importance_threshold": -0.0030648693220572906, + "compression/movement_sparsity/linear_layer_sparsity": 0.1722077915945493, + "compression/movement_sparsity/model_sparsity": 0.16629192418014405, + "compression_loss": 56.469905853271484, + "distillation_loss": 0.2561332583427429, + "epoch": 1.13, + "learning_rate": 4.43279797125951e-05, + "loss": 57.3984, + "step": 1342, + "task_loss": 0.03715972602367401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5267873438734849, + "compression/movement_sparsity/importance_threshold": -0.003055916447523694, + "compression/movement_sparsity/linear_layer_sparsity": 0.1742794487829336, + "compression/movement_sparsity/model_sparsity": 0.16829241357094438, + "compression_loss": 56.77617263793945, + "distillation_loss": 0.5091273784637451, + "epoch": 1.14, + "learning_rate": 4.432375316990702e-05, + "loss": 57.3105, + "step": 1343, + "task_loss": 0.34252116084098816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5296329931099835, + "compression/movement_sparsity/importance_threshold": -0.003046981024981708, + "compression/movement_sparsity/linear_layer_sparsity": 0.17643600604488524, + "compression/movement_sparsity/model_sparsity": 0.17037488645660184, + "compression_loss": 57.081825256347656, + "distillation_loss": 0.5197078585624695, + "epoch": 1.14, + "learning_rate": 4.431952662721894e-05, + "loss": 57.726, + "step": 1344, + "task_loss": 0.37829092144966125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5324730898619304, + "compression/movement_sparsity/importance_threshold": -0.003038063037404998, + "compression/movement_sparsity/linear_layer_sparsity": 0.17847503871061782, + "compression/movement_sparsity/model_sparsity": 0.17234387207746832, + "compression_loss": 57.38687515258789, + "distillation_loss": 0.37463319301605225, + "epoch": 1.14, + "learning_rate": 4.431530008453086e-05, + "loss": 57.8522, + "step": 1345, + "task_loss": 1.0771381855010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5353076395516739, + "compression/movement_sparsity/importance_threshold": -0.003029162467767231, + "compression/movement_sparsity/linear_layer_sparsity": 0.18065012612907558, + "compression/movement_sparsity/model_sparsity": 0.1744442385517505, + "compression_loss": 57.6912841796875, + "distillation_loss": 0.6838735938072205, + "epoch": 1.14, + "learning_rate": 4.431107354184277e-05, + "loss": 58.6726, + "step": 1346, + "task_loss": 0.6715477108955383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5381366476015617, + "compression/movement_sparsity/importance_threshold": -0.0030202792990420752, + "compression/movement_sparsity/linear_layer_sparsity": 0.18289939379439607, + "compression/movement_sparsity/model_sparsity": 0.17661623695321052, + "compression_loss": 57.995113372802734, + "distillation_loss": 0.5443418622016907, + "epoch": 1.14, + "learning_rate": 4.430684699915469e-05, + "loss": 58.6149, + "step": 1347, + "task_loss": 0.4529528021812439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5409601194339426, + "compression/movement_sparsity/importance_threshold": -0.003011413514203195, + "compression/movement_sparsity/linear_layer_sparsity": 0.18485191662393044, + "compression/movement_sparsity/model_sparsity": 0.17850168461688762, + "compression_loss": 58.298362731933594, + "distillation_loss": 0.3564160466194153, + "epoch": 1.14, + "learning_rate": 4.430262045646662e-05, + "loss": 58.8935, + "step": 1348, + "task_loss": 0.7282147407531738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5437780604711646, + "compression/movement_sparsity/importance_threshold": -0.003002565096224258, + "compression/movement_sparsity/linear_layer_sparsity": 0.18723356639834426, + "compression/movement_sparsity/model_sparsity": 0.18080151739473863, + "compression_loss": 58.60100173950195, + "distillation_loss": 0.4090028703212738, + "epoch": 1.14, + "learning_rate": 4.429839391377853e-05, + "loss": 59.1323, + "step": 1349, + "task_loss": 0.05018392950296402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5465904761355758, + "compression/movement_sparsity/importance_threshold": -0.0029937340280789316, + "compression/movement_sparsity/linear_layer_sparsity": 0.1895519107667793, + "compression/movement_sparsity/model_sparsity": 0.18304021950205643, + "compression_loss": 58.90302658081055, + "distillation_loss": 0.6254972815513611, + "epoch": 1.14, + "learning_rate": 4.429416737109045e-05, + "loss": 59.6259, + "step": 1350, + "task_loss": 1.0108393430709839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.549397371849524, + "compression/movement_sparsity/importance_threshold": -0.002984920292740883, + "compression/movement_sparsity/linear_layer_sparsity": 0.19198836401564756, + "compression/movement_sparsity/model_sparsity": 0.18539297308641917, + "compression_loss": 59.204490661621094, + "distillation_loss": 0.38609322905540466, + "epoch": 1.14, + "learning_rate": 4.428994082840237e-05, + "loss": 59.99, + "step": 1351, + "task_loss": 0.8283719420433044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5521987530353583, + "compression/movement_sparsity/importance_threshold": -0.0029761238731837772, + "compression/movement_sparsity/linear_layer_sparsity": 0.19427081663949863, + "compression/movement_sparsity/model_sparsity": 0.18759701644099538, + "compression_loss": 59.505332946777344, + "distillation_loss": 0.6423802375793457, + "epoch": 1.14, + "learning_rate": 4.428571428571428e-05, + "loss": 60.2511, + "step": 1352, + "task_loss": 0.8454609513282776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5549946251154263, + "compression/movement_sparsity/importance_threshold": -0.002967344752381282, + "compression/movement_sparsity/linear_layer_sparsity": 0.19643606661965682, + "compression/movement_sparsity/model_sparsity": 0.18968788342324708, + "compression_loss": 59.805545806884766, + "distillation_loss": 0.7758719325065613, + "epoch": 1.14, + "learning_rate": 4.428148774302621e-05, + "loss": 60.5515, + "step": 1353, + "task_loss": 0.945686936378479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5577849935120762, + "compression/movement_sparsity/importance_threshold": -0.002958582913307064, + "compression/movement_sparsity/linear_layer_sparsity": 0.19867671311177657, + "compression/movement_sparsity/model_sparsity": 0.19185155681532765, + "compression_loss": 60.10515594482422, + "distillation_loss": 0.69838947057724, + "epoch": 1.14, + "learning_rate": 4.427726120033813e-05, + "loss": 60.8867, + "step": 1354, + "task_loss": 0.34210434556007385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5605698636476562, + "compression/movement_sparsity/importance_threshold": -0.0029498383389347906, + "compression/movement_sparsity/linear_layer_sparsity": 0.2010280755003953, + "compression/movement_sparsity/model_sparsity": 0.19412214267226052, + "compression_loss": 60.40419387817383, + "distillation_loss": 1.1966954469680786, + "epoch": 1.15, + "learning_rate": 4.427303465765004e-05, + "loss": 61.2657, + "step": 1355, + "task_loss": 0.6600598096847534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5633492409445147, + "compression/movement_sparsity/importance_threshold": -0.002941111012238127, + "compression/movement_sparsity/linear_layer_sparsity": 0.2033781262305741, + "compression/movement_sparsity/model_sparsity": 0.19639146193025603, + "compression_loss": 60.70262908935547, + "distillation_loss": 0.7910445928573608, + "epoch": 1.15, + "learning_rate": 4.426880811496196e-05, + "loss": 61.6675, + "step": 1356, + "task_loss": 0.6780710220336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5661231308249999, + "compression/movement_sparsity/importance_threshold": -0.00293240091619074, + "compression/movement_sparsity/linear_layer_sparsity": 0.20531577962306657, + "compression/movement_sparsity/model_sparsity": 0.1982625509677973, + "compression_loss": 61.00050354003906, + "distillation_loss": 0.5539929866790771, + "epoch": 1.15, + "learning_rate": 4.426458157227388e-05, + "loss": 61.6449, + "step": 1357, + "task_loss": 0.31943202018737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5688915387114595, + "compression/movement_sparsity/importance_threshold": -0.0029237080337662985, + "compression/movement_sparsity/linear_layer_sparsity": 0.20764713325839232, + "compression/movement_sparsity/model_sparsity": 0.20051381543366695, + "compression_loss": 61.297794342041016, + "distillation_loss": 0.8837694525718689, + "epoch": 1.15, + "learning_rate": 4.42603550295858e-05, + "loss": 62.0661, + "step": 1358, + "task_loss": 1.372668743133545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5716544700262425, + "compression/movement_sparsity/importance_threshold": -0.002915032347938466, + "compression/movement_sparsity/linear_layer_sparsity": 0.20992387420594583, + "compression/movement_sparsity/model_sparsity": 0.2027123433255976, + "compression_loss": 61.594505310058594, + "distillation_loss": 0.8430509567260742, + "epoch": 1.15, + "learning_rate": 4.425612848689772e-05, + "loss": 62.337, + "step": 1359, + "task_loss": 0.5275387167930603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5744119301916966, + "compression/movement_sparsity/importance_threshold": -0.002906373841680912, + "compression/movement_sparsity/linear_layer_sparsity": 0.21205207579725938, + "compression/movement_sparsity/model_sparsity": 0.20476743464513567, + "compression_loss": 61.890655517578125, + "distillation_loss": 0.8673979640007019, + "epoch": 1.15, + "learning_rate": 4.425190194420964e-05, + "loss": 62.8152, + "step": 1360, + "task_loss": 0.978115975856781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5771639246301696, + "compression/movement_sparsity/importance_threshold": -0.002897732497967303, + "compression/movement_sparsity/linear_layer_sparsity": 0.21415770493923825, + "compression/movement_sparsity/model_sparsity": 0.20680072894841464, + "compression_loss": 62.18621063232422, + "distillation_loss": 0.9977774620056152, + "epoch": 1.15, + "learning_rate": 4.424767540152156e-05, + "loss": 63.0701, + "step": 1361, + "task_loss": 1.0909597873687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5799104587640107, + "compression/movement_sparsity/importance_threshold": -0.002889108299771303, + "compression/movement_sparsity/linear_layer_sparsity": 0.21637436000607466, + "compression/movement_sparsity/model_sparsity": 0.2089412350944766, + "compression_loss": 62.481117248535156, + "distillation_loss": 0.8493362665176392, + "epoch": 1.15, + "learning_rate": 4.4243448858833473e-05, + "loss": 63.2991, + "step": 1362, + "task_loss": 0.7382258772850037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5826515380155675, + "compression/movement_sparsity/importance_threshold": -0.0028805012300665814, + "compression/movement_sparsity/linear_layer_sparsity": 0.21853450644248468, + "compression/movement_sparsity/model_sparsity": 0.21102717385540823, + "compression_loss": 62.77546691894531, + "distillation_loss": 1.1857895851135254, + "epoch": 1.15, + "learning_rate": 4.423922231614539e-05, + "loss": 63.5875, + "step": 1363, + "task_loss": 0.5139634013175964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.585387167807188, + "compression/movement_sparsity/importance_threshold": -0.002871911271826805, + "compression/movement_sparsity/linear_layer_sparsity": 0.2205499411804659, + "compression/movement_sparsity/model_sparsity": 0.2129733722099373, + "compression_loss": 63.069217681884766, + "distillation_loss": 0.9533974528312683, + "epoch": 1.15, + "learning_rate": 4.423499577345731e-05, + "loss": 63.9281, + "step": 1364, + "task_loss": 1.2177783250808716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5881173535612209, + "compression/movement_sparsity/importance_threshold": -0.0028633384080256384, + "compression/movement_sparsity/linear_layer_sparsity": 0.2226018519472452, + "compression/movement_sparsity/model_sparsity": 0.21495479352946187, + "compression_loss": 63.36244583129883, + "distillation_loss": 0.601244330406189, + "epoch": 1.15, + "learning_rate": 4.423076923076923e-05, + "loss": 64.0464, + "step": 1365, + "task_loss": 1.0718969106674194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5908421007000141, + "compression/movement_sparsity/importance_threshold": -0.00285478262163675, + "compression/movement_sparsity/linear_layer_sparsity": 0.22477162118693736, + "compression/movement_sparsity/model_sparsity": 0.2170500245207797, + "compression_loss": 63.65501403808594, + "distillation_loss": 0.6122133135795593, + "epoch": 1.15, + "learning_rate": 4.422654268808115e-05, + "loss": 64.4711, + "step": 1366, + "task_loss": 0.8908010125160217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5935614146459156, + "compression/movement_sparsity/importance_threshold": -0.0028462438956338067, + "compression/movement_sparsity/linear_layer_sparsity": 0.22698790660457704, + "compression/movement_sparsity/model_sparsity": 0.21919017371623203, + "compression_loss": 63.94700241088867, + "distillation_loss": 0.7002647519111633, + "epoch": 1.16, + "learning_rate": 4.422231614539307e-05, + "loss": 64.7485, + "step": 1367, + "task_loss": 0.7219159007072449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.596275300821274, + "compression/movement_sparsity/importance_threshold": -0.002837722212990474, + "compression/movement_sparsity/linear_layer_sparsity": 0.22913723782094136, + "compression/movement_sparsity/model_sparsity": 0.221265668793198, + "compression_loss": 64.23841094970703, + "distillation_loss": 0.7693737149238586, + "epoch": 1.16, + "learning_rate": 4.4218089602704985e-05, + "loss": 65.142, + "step": 1368, + "task_loss": 0.6063172221183777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5989837646484375, + "compression/movement_sparsity/importance_threshold": -0.002829217556680419, + "compression/movement_sparsity/linear_layer_sparsity": 0.23141529042693482, + "compression/movement_sparsity/model_sparsity": 0.22346546328406605, + "compression_loss": 64.52925109863281, + "distillation_loss": 0.6864047050476074, + "epoch": 1.16, + "learning_rate": 4.4213863060016905e-05, + "loss": 65.4941, + "step": 1369, + "task_loss": 0.6815376281738281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6016868115497541, + "compression/movement_sparsity/importance_threshold": -0.0028207299096773084, + "compression/movement_sparsity/linear_layer_sparsity": 0.2337316553835423, + "compression/movement_sparsity/model_sparsity": 0.22570225397844196, + "compression_loss": 64.81948852539062, + "distillation_loss": 0.6462836265563965, + "epoch": 1.16, + "learning_rate": 4.420963651732883e-05, + "loss": 65.8191, + "step": 1370, + "task_loss": 1.1219321489334106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6043844469475719, + "compression/movement_sparsity/importance_threshold": -0.00281225925495481, + "compression/movement_sparsity/linear_layer_sparsity": 0.23573174371777614, + "compression/movement_sparsity/model_sparsity": 0.22763363312540347, + "compression_loss": 65.109130859375, + "distillation_loss": 1.332759976387024, + "epoch": 1.16, + "learning_rate": 4.4205409974640744e-05, + "loss": 66.0725, + "step": 1371, + "task_loss": 1.4456849098205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.607076676264239, + "compression/movement_sparsity/importance_threshold": -0.00280380557548659, + "compression/movement_sparsity/linear_layer_sparsity": 0.23804602194504734, + "compression/movement_sparsity/model_sparsity": 0.22986840877601533, + "compression_loss": 65.39823150634766, + "distillation_loss": 1.0189800262451172, + "epoch": 1.16, + "learning_rate": 4.4201183431952664e-05, + "loss": 66.222, + "step": 1372, + "task_loss": 0.9191982746124268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6097635049221041, + "compression/movement_sparsity/importance_threshold": -0.0027953688542463138, + "compression/movement_sparsity/linear_layer_sparsity": 0.24014831232008815, + "compression/movement_sparsity/model_sparsity": 0.23189847900927185, + "compression_loss": 65.68670654296875, + "distillation_loss": 0.9639915227890015, + "epoch": 1.16, + "learning_rate": 4.4196956889264584e-05, + "loss": 66.4569, + "step": 1373, + "task_loss": 1.041630506515503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6124449383435151, + "compression/movement_sparsity/importance_threshold": -0.00278694907420765, + "compression/movement_sparsity/linear_layer_sparsity": 0.24250219070807805, + "compression/movement_sparsity/model_sparsity": 0.23417149443325738, + "compression_loss": 65.97455596923828, + "distillation_loss": 1.028700828552246, + "epoch": 1.16, + "learning_rate": 4.4192730346576503e-05, + "loss": 66.8127, + "step": 1374, + "task_loss": 1.1086666584014893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6151209819508199, + "compression/movement_sparsity/importance_threshold": -0.0027785462183442643, + "compression/movement_sparsity/linear_layer_sparsity": 0.2447815668966791, + "compression/movement_sparsity/model_sparsity": 0.2363725670375986, + "compression_loss": 66.2618408203125, + "distillation_loss": 0.7350916862487793, + "epoch": 1.16, + "learning_rate": 4.418850380388842e-05, + "loss": 66.8838, + "step": 1375, + "task_loss": 0.8811807632446289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6177916411663673, + "compression/movement_sparsity/importance_threshold": -0.0027701602696298235, + "compression/movement_sparsity/linear_layer_sparsity": 0.24691848505453445, + "compression/movement_sparsity/model_sparsity": 0.23843607548280246, + "compression_loss": 66.54849243164062, + "distillation_loss": 0.5729374885559082, + "epoch": 1.16, + "learning_rate": 4.418427726120034e-05, + "loss": 67.3083, + "step": 1376, + "task_loss": 1.3767502307891846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6204569214125051, + "compression/movement_sparsity/importance_threshold": -0.002761791211037994, + "compression/movement_sparsity/linear_layer_sparsity": 0.2489480976278347, + "compression/movement_sparsity/model_sparsity": 0.24039596462039126, + "compression_loss": 66.83454895019531, + "distillation_loss": 0.9451680183410645, + "epoch": 1.16, + "learning_rate": 4.418005071851226e-05, + "loss": 67.8146, + "step": 1377, + "task_loss": 1.2235774993896484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6231168281115813, + "compression/movement_sparsity/importance_threshold": -0.002753439025542444, + "compression/movement_sparsity/linear_layer_sparsity": 0.2511811245768351, + "compression/movement_sparsity/model_sparsity": 0.2425522802240991, + "compression_loss": 67.12010955810547, + "distillation_loss": 1.1316816806793213, + "epoch": 1.16, + "learning_rate": 4.4175824175824176e-05, + "loss": 68.3442, + "step": 1378, + "task_loss": 0.37538233399391174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6257713666859449, + "compression/movement_sparsity/importance_threshold": -0.002745103696116838, + "compression/movement_sparsity/linear_layer_sparsity": 0.2534306188013407, + "compression/movement_sparsity/model_sparsity": 0.24472449740173924, + "compression_loss": 67.40505981445312, + "distillation_loss": 0.8693324327468872, + "epoch": 1.17, + "learning_rate": 4.4171597633136095e-05, + "loss": 68.1973, + "step": 1379, + "task_loss": 0.3585340678691864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6284205425579432, + "compression/movement_sparsity/importance_threshold": -0.0027367852057348444, + "compression/movement_sparsity/linear_layer_sparsity": 0.2556787178982329, + "compression/movement_sparsity/model_sparsity": 0.2468953673786914, + "compression_loss": 67.68950653076172, + "distillation_loss": 0.7742630839347839, + "epoch": 1.17, + "learning_rate": 4.4167371090448015e-05, + "loss": 68.546, + "step": 1380, + "task_loss": 0.22085602581501007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6310643611499249, + "compression/movement_sparsity/importance_threshold": -0.0027284835373701296, + "compression/movement_sparsity/linear_layer_sparsity": 0.2576685752966351, + "compression/movement_sparsity/model_sparsity": 0.24881686705394118, + "compression_loss": 67.97335815429688, + "distillation_loss": 1.2497050762176514, + "epoch": 1.17, + "learning_rate": 4.4163144547759935e-05, + "loss": 69.1566, + "step": 1381, + "task_loss": 1.7637649774551392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.633702827884238, + "compression/movement_sparsity/importance_threshold": -0.0027201986739963597, + "compression/movement_sparsity/linear_layer_sparsity": 0.2598670937044974, + "compression/movement_sparsity/model_sparsity": 0.25093985959105963, + "compression_loss": 68.25659942626953, + "distillation_loss": 1.1690393686294556, + "epoch": 1.17, + "learning_rate": 4.4158918005071855e-05, + "loss": 69.3858, + "step": 1382, + "task_loss": 0.432060569524765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.636335948183231, + "compression/movement_sparsity/importance_threshold": -0.0027119305985872015, + "compression/movement_sparsity/linear_layer_sparsity": 0.2618242073385716, + "compression/movement_sparsity/model_sparsity": 0.25282974035101763, + "compression_loss": 68.53935241699219, + "distillation_loss": 1.0316088199615479, + "epoch": 1.17, + "learning_rate": 4.4154691462383774e-05, + "loss": 69.5442, + "step": 1383, + "task_loss": 1.9555219411849976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6389637274692515, + "compression/movement_sparsity/importance_threshold": -0.002703679294116324, + "compression/movement_sparsity/linear_layer_sparsity": 0.2638772866737792, + "compression/movement_sparsity/model_sparsity": 0.2548122900950501, + "compression_loss": 68.82148742675781, + "distillation_loss": 1.121019959449768, + "epoch": 1.17, + "learning_rate": 4.415046491969569e-05, + "loss": 69.6437, + "step": 1384, + "task_loss": 1.2901555299758911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6415861711646482, + "compression/movement_sparsity/importance_threshold": -0.0026954447435573897, + "compression/movement_sparsity/linear_layer_sparsity": 0.2661049477473435, + "compression/movement_sparsity/model_sparsity": 0.2569634241576504, + "compression_loss": 69.10302734375, + "distillation_loss": 0.8796181678771973, + "epoch": 1.17, + "learning_rate": 4.414623837700761e-05, + "loss": 70.1302, + "step": 1385, + "task_loss": 1.7014614343643188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6442032846917692, + "compression/movement_sparsity/importance_threshold": -0.0026872269298840686, + "compression/movement_sparsity/linear_layer_sparsity": 0.26821227012112664, + "compression/movement_sparsity/model_sparsity": 0.2589983535250122, + "compression_loss": 69.3840103149414, + "distillation_loss": 0.8436229228973389, + "epoch": 1.17, + "learning_rate": 4.414201183431953e-05, + "loss": 70.2836, + "step": 1386, + "task_loss": 0.7809646725654602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6468150734729627, + "compression/movement_sparsity/importance_threshold": -0.0026790258360700264, + "compression/movement_sparsity/linear_layer_sparsity": 0.2704441404258664, + "compression/movement_sparsity/model_sparsity": 0.261153552218748, + "compression_loss": 69.66437530517578, + "distillation_loss": 0.48853152990341187, + "epoch": 1.17, + "learning_rate": 4.4137785291631447e-05, + "loss": 70.4585, + "step": 1387, + "task_loss": 0.12014459818601608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6494215429305765, + "compression/movement_sparsity/importance_threshold": -0.0026708414450889303, + "compression/movement_sparsity/linear_layer_sparsity": 0.2725881534634651, + "compression/movement_sparsity/model_sparsity": 0.2632239118127496, + "compression_loss": 69.94415283203125, + "distillation_loss": 0.686325192451477, + "epoch": 1.17, + "learning_rate": 4.4133558748943366e-05, + "loss": 70.7651, + "step": 1388, + "task_loss": 0.9187728762626648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6520226984869595, + "compression/movement_sparsity/importance_threshold": -0.0026626737399144456, + "compression/movement_sparsity/linear_layer_sparsity": 0.2747935759643885, + "compression/movement_sparsity/model_sparsity": 0.2653535712660931, + "compression_loss": 70.22332000732422, + "distillation_loss": 0.7436723709106445, + "epoch": 1.17, + "learning_rate": 4.4129332206255286e-05, + "loss": 71.1187, + "step": 1389, + "task_loss": 0.863633930683136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6546185455644595, + "compression/movement_sparsity/importance_threshold": -0.00265452270352024, + "compression/movement_sparsity/linear_layer_sparsity": 0.2772152909420588, + "compression/movement_sparsity/model_sparsity": 0.26769209288421375, + "compression_loss": 70.50190734863281, + "distillation_loss": 0.8926471471786499, + "epoch": 1.17, + "learning_rate": 4.4125105663567206e-05, + "loss": 71.53, + "step": 1390, + "task_loss": 0.5865926146507263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6572090895854248, + "compression/movement_sparsity/importance_threshold": -0.0026463883188799805, + "compression/movement_sparsity/linear_layer_sparsity": 0.2794186147894783, + "compression/movement_sparsity/model_sparsity": 0.26981972577925745, + "compression_loss": 70.77996063232422, + "distillation_loss": 0.6902129650115967, + "epoch": 1.18, + "learning_rate": 4.412087912087912e-05, + "loss": 71.827, + "step": 1391, + "task_loss": 0.9052955508232117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6597943359722029, + "compression/movement_sparsity/importance_threshold": -0.002638270568967335, + "compression/movement_sparsity/linear_layer_sparsity": 0.2815503340102445, + "compression/movement_sparsity/model_sparsity": 0.2718782138868549, + "compression_loss": 71.05741119384766, + "distillation_loss": 1.233741283416748, + "epoch": 1.18, + "learning_rate": 4.4116652578191045e-05, + "loss": 72.0276, + "step": 1392, + "task_loss": 1.1508111953735352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6623742901471432, + "compression/movement_sparsity/importance_threshold": -0.0026301694367559672, + "compression/movement_sparsity/linear_layer_sparsity": 0.2837227146426489, + "compression/movement_sparsity/model_sparsity": 0.2739759665615117, + "compression_loss": 71.33426666259766, + "distillation_loss": 0.8052124977111816, + "epoch": 1.18, + "learning_rate": 4.4112426035502965e-05, + "loss": 72.2643, + "step": 1393, + "task_loss": 0.5793078541755676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6649489575325933, + "compression/movement_sparsity/importance_threshold": -0.002622084905219545, + "compression/movement_sparsity/linear_layer_sparsity": 0.2860477723174629, + "compression/movement_sparsity/model_sparsity": 0.2762211513524818, + "compression_loss": 71.61056518554688, + "distillation_loss": 1.206644058227539, + "epoch": 1.18, + "learning_rate": 4.410819949281488e-05, + "loss": 72.6324, + "step": 1394, + "task_loss": 1.0528587102890015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.667518343550901, + "compression/movement_sparsity/importance_threshold": -0.002614016957331737, + "compression/movement_sparsity/linear_layer_sparsity": 0.28834676376182494, + "compression/movement_sparsity/model_sparsity": 0.27844116536820507, + "compression_loss": 71.8862533569336, + "distillation_loss": 1.1972408294677734, + "epoch": 1.18, + "learning_rate": 4.41039729501268e-05, + "loss": 72.8677, + "step": 1395, + "task_loss": 0.870022177696228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6700824536244152, + "compression/movement_sparsity/importance_threshold": -0.0026059655760662078, + "compression/movement_sparsity/linear_layer_sparsity": 0.29041409247735744, + "compression/movement_sparsity/model_sparsity": 0.28043747498251204, + "compression_loss": 72.16140747070312, + "distillation_loss": 2.1123623847961426, + "epoch": 1.18, + "learning_rate": 4.409974640743872e-05, + "loss": 73.4825, + "step": 1396, + "task_loss": 1.6239508390426636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6726412931754838, + "compression/movement_sparsity/importance_threshold": -0.0025979307443966242, + "compression/movement_sparsity/linear_layer_sparsity": 0.2927393647871889, + "compression/movement_sparsity/model_sparsity": 0.28268286703512646, + "compression_loss": 72.43602752685547, + "distillation_loss": 0.9762169122695923, + "epoch": 1.18, + "learning_rate": 4.409551986475064e-05, + "loss": 73.3545, + "step": 1397, + "task_loss": 1.3902076482772827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6751948676264545, + "compression/movement_sparsity/importance_threshold": -0.0025899124452966552, + "compression/movement_sparsity/linear_layer_sparsity": 0.29480649079187154, + "compression/movement_sparsity/model_sparsity": 0.28467898090232485, + "compression_loss": 72.71007537841797, + "distillation_loss": 0.7956358194351196, + "epoch": 1.18, + "learning_rate": 4.409129332206256e-05, + "loss": 73.6023, + "step": 1398, + "task_loss": 0.37409406900405884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6777431823996767, + "compression/movement_sparsity/importance_threshold": -0.002581910661739964, + "compression/movement_sparsity/linear_layer_sparsity": 0.2970966583521831, + "compression/movement_sparsity/model_sparsity": 0.28689047416156016, + "compression_loss": 72.98355865478516, + "distillation_loss": 1.7923378944396973, + "epoch": 1.18, + "learning_rate": 4.408706677937448e-05, + "loss": 74.4578, + "step": 1399, + "task_loss": 1.2904648780822754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6802862429174975, + "compression/movement_sparsity/importance_threshold": -0.0025739253767002195, + "compression/movement_sparsity/linear_layer_sparsity": 0.2993565623750347, + "compression/movement_sparsity/model_sparsity": 0.2890727435289489, + "compression_loss": 73.2564697265625, + "distillation_loss": 1.7499198913574219, + "epoch": 1.18, + "learning_rate": 4.408284023668639e-05, + "loss": 74.432, + "step": 1400, + "task_loss": 1.7201018333435059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6828240546022653, + "compression/movement_sparsity/importance_threshold": -0.002565956573151089, + "compression/movement_sparsity/linear_layer_sparsity": 0.30159656496210213, + "compression/movement_sparsity/model_sparsity": 0.2912357951360966, + "compression_loss": 73.52880859375, + "distillation_loss": 0.9857583045959473, + "epoch": 1.18, + "learning_rate": 4.407861369399831e-05, + "loss": 74.4157, + "step": 1401, + "task_loss": 0.9768823385238647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6853566228763288, + "compression/movement_sparsity/importance_threshold": -0.0025580042340662373, + "compression/movement_sparsity/linear_layer_sparsity": 0.3037310148172569, + "compression/movement_sparsity/model_sparsity": 0.29329692007239094, + "compression_loss": 73.80050659179688, + "distillation_loss": 1.4733920097351074, + "epoch": 1.19, + "learning_rate": 4.407438715131023e-05, + "loss": 74.9868, + "step": 1402, + "task_loss": 0.8372330069541931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.687883953162036, + "compression/movement_sparsity/importance_threshold": -0.0025500683424193316, + "compression/movement_sparsity/linear_layer_sparsity": 0.30602875422401715, + "compression/movement_sparsity/model_sparsity": 0.2955157250618558, + "compression_loss": 74.07170104980469, + "distillation_loss": 0.8430300354957581, + "epoch": 1.19, + "learning_rate": 4.407016060862215e-05, + "loss": 75.0115, + "step": 1403, + "task_loss": 1.564890742301941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6904060508817345, + "compression/movement_sparsity/importance_threshold": -0.002542148881184041, + "compression/movement_sparsity/linear_layer_sparsity": 0.30824122390801334, + "compression/movement_sparsity/model_sparsity": 0.2976521896058539, + "compression_loss": 74.3423080444336, + "distillation_loss": 1.175536036491394, + "epoch": 1.19, + "learning_rate": 4.406593406593407e-05, + "loss": 75.3343, + "step": 1404, + "task_loss": 0.9068008661270142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6929229214577731, + "compression/movement_sparsity/importance_threshold": -0.0025342458333340286, + "compression/movement_sparsity/linear_layer_sparsity": 0.31055848317719353, + "compression/movement_sparsity/model_sparsity": 0.29988984389041434, + "compression_loss": 74.61233520507812, + "distillation_loss": 1.0687832832336426, + "epoch": 1.19, + "learning_rate": 4.406170752324599e-05, + "loss": 75.6561, + "step": 1405, + "task_loss": 0.6772088408470154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6954345703125, + "compression/movement_sparsity/importance_threshold": -0.002526359181842963, + "compression/movement_sparsity/linear_layer_sparsity": 0.31300005189397756, + "compression/movement_sparsity/model_sparsity": 0.30224753721063297, + "compression_loss": 74.88172149658203, + "distillation_loss": 0.8092421293258667, + "epoch": 1.19, + "learning_rate": 4.405748098055791e-05, + "loss": 76.1355, + "step": 1406, + "task_loss": 1.1415399312973022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6979410028682633, + "compression/movement_sparsity/importance_threshold": -0.002518488909684511, + "compression/movement_sparsity/linear_layer_sparsity": 0.3151791935294315, + "compression/movement_sparsity/model_sparsity": 0.30435181862708527, + "compression_loss": 75.15054321289062, + "distillation_loss": 0.9002688527107239, + "epoch": 1.19, + "learning_rate": 4.405325443786982e-05, + "loss": 76.1927, + "step": 1407, + "task_loss": 0.997353196144104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.700442224547411, + "compression/movement_sparsity/importance_threshold": -0.002510634999832339, + "compression/movement_sparsity/linear_layer_sparsity": 0.31749029992811156, + "compression/movement_sparsity/model_sparsity": 0.3065835314111758, + "compression_loss": 75.4188232421875, + "distillation_loss": 0.8566809296607971, + "epoch": 1.19, + "learning_rate": 4.404902789518174e-05, + "loss": 76.8891, + "step": 1408, + "task_loss": 1.3952616453170776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7029382407722911, + "compression/movement_sparsity/importance_threshold": -0.002502797435260115, + "compression/movement_sparsity/linear_layer_sparsity": 0.31967678685082923, + "compression/movement_sparsity/model_sparsity": 0.3086949057816776, + "compression_loss": 75.6865463256836, + "distillation_loss": 1.2107224464416504, + "epoch": 1.19, + "learning_rate": 4.404480135249367e-05, + "loss": 76.9155, + "step": 1409, + "task_loss": 1.0626639127731323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7054290569652526, + "compression/movement_sparsity/importance_threshold": -0.0024949761989415033, + "compression/movement_sparsity/linear_layer_sparsity": 0.3218799318357342, + "compression/movement_sparsity/model_sparsity": 0.3108223659586843, + "compression_loss": 75.95377349853516, + "distillation_loss": 1.1467063426971436, + "epoch": 1.19, + "learning_rate": 4.404057480980558e-05, + "loss": 77.4632, + "step": 1410, + "task_loss": 1.2172901630401611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7079146785486432, + "compression/movement_sparsity/importance_threshold": -0.002487171273850172, + "compression/movement_sparsity/linear_layer_sparsity": 0.3240487113695126, + "compression/movement_sparsity/model_sparsity": 0.3129166412435312, + "compression_loss": 76.22038269042969, + "distillation_loss": 1.2668073177337646, + "epoch": 1.19, + "learning_rate": 4.40363482671175e-05, + "loss": 77.3292, + "step": 1411, + "task_loss": 1.6572024822235107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7103951109448107, + "compression/movement_sparsity/importance_threshold": -0.0024793826429597887, + "compression/movement_sparsity/linear_layer_sparsity": 0.32622865192419814, + "compression/movement_sparsity/model_sparsity": 0.31502169413388176, + "compression_loss": 76.48645782470703, + "distillation_loss": 1.4000005722045898, + "epoch": 1.19, + "learning_rate": 4.403212172442942e-05, + "loss": 78.1424, + "step": 1412, + "task_loss": 1.791578769683838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7128703595761041, + "compression/movement_sparsity/importance_threshold": -0.002471610289244018, + "compression/movement_sparsity/linear_layer_sparsity": 0.3284364115619782, + "compression/movement_sparsity/model_sparsity": 0.31715361043624096, + "compression_loss": 76.75199127197266, + "distillation_loss": 0.8285961151123047, + "epoch": 1.19, + "learning_rate": 4.402789518174133e-05, + "loss": 77.8442, + "step": 1413, + "task_loss": 1.348632574081421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7153404298648711, + "compression/movement_sparsity/importance_threshold": -0.002463854195676528, + "compression/movement_sparsity/linear_layer_sparsity": 0.33053977511210625, + "compression/movement_sparsity/model_sparsity": 0.319184716977719, + "compression_loss": 77.01703643798828, + "distillation_loss": 1.1366537809371948, + "epoch": 1.2, + "learning_rate": 4.402366863905326e-05, + "loss": 77.9873, + "step": 1414, + "task_loss": 1.0489082336425781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7178053272334597, + "compression/movement_sparsity/importance_threshold": -0.002456114345230986, + "compression/movement_sparsity/linear_layer_sparsity": 0.3328285356206367, + "compression/movement_sparsity/model_sparsity": 0.32139485152173053, + "compression_loss": 77.28144836425781, + "distillation_loss": 1.08726167678833, + "epoch": 1.2, + "learning_rate": 4.401944209636518e-05, + "loss": 78.482, + "step": 1415, + "task_loss": 0.9780789017677307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7202650571042188, + "compression/movement_sparsity/importance_threshold": -0.0024483907208810566, + "compression/movement_sparsity/linear_layer_sparsity": 0.3350009281772088, + "compression/movement_sparsity/model_sparsity": 0.32349261571092314, + "compression_loss": 77.5453872680664, + "distillation_loss": 1.2207279205322266, + "epoch": 1.2, + "learning_rate": 4.401521555367709e-05, + "loss": 78.9881, + "step": 1416, + "task_loss": 0.6573187112808228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7227196248994963, + "compression/movement_sparsity/importance_threshold": -0.0024406833056004077, + "compression/movement_sparsity/linear_layer_sparsity": 0.3372261209480724, + "compression/movement_sparsity/model_sparsity": 0.325641366264614, + "compression_loss": 77.80865478515625, + "distillation_loss": 1.2851381301879883, + "epoch": 1.2, + "learning_rate": 4.401098901098901e-05, + "loss": 78.8852, + "step": 1417, + "task_loss": 1.7367502450942993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7251690360416397, + "compression/movement_sparsity/importance_threshold": -0.002432992082362708, + "compression/movement_sparsity/linear_layer_sparsity": 0.3393983465662976, + "compression/movement_sparsity/model_sparsity": 0.3277389692503055, + "compression_loss": 78.07147216796875, + "distillation_loss": 0.9510504007339478, + "epoch": 1.2, + "learning_rate": 4.400676246830093e-05, + "loss": 79.328, + "step": 1418, + "task_loss": 0.9809507727622986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7276132959529984, + "compression/movement_sparsity/importance_threshold": -0.0024253170341416203, + "compression/movement_sparsity/linear_layer_sparsity": 0.3416635568437472, + "compression/movement_sparsity/model_sparsity": 0.32992636258612285, + "compression_loss": 78.33370208740234, + "distillation_loss": 1.3271602392196655, + "epoch": 1.2, + "learning_rate": 4.400253592561285e-05, + "loss": 79.8687, + "step": 1419, + "task_loss": 1.0307344198226929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7300524100559197, + "compression/movement_sparsity/importance_threshold": -0.0024176581439108143, + "compression/movement_sparsity/linear_layer_sparsity": 0.3439886979877347, + "compression/movement_sparsity/model_sparsity": 0.33217162797884353, + "compression_loss": 78.59539794921875, + "distillation_loss": 1.4630053043365479, + "epoch": 1.2, + "learning_rate": 4.399830938292477e-05, + "loss": 80.0691, + "step": 1420, + "task_loss": 1.008732795715332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7324863837727519, + "compression/movement_sparsity/importance_threshold": -0.002410015394643956, + "compression/movement_sparsity/linear_layer_sparsity": 0.3464207154462424, + "compression/movement_sparsity/model_sparsity": 0.3345200981558907, + "compression_loss": 78.85662841796875, + "distillation_loss": 0.7131186127662659, + "epoch": 1.2, + "learning_rate": 4.399408284023669e-05, + "loss": 80.2534, + "step": 1421, + "task_loss": 0.5300425291061401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7349152225258437, + "compression/movement_sparsity/importance_threshold": -0.0024023887693147114, + "compression/movement_sparsity/linear_layer_sparsity": 0.348494328198119, + "compression/movement_sparsity/model_sparsity": 0.33652247593056134, + "compression_loss": 79.11727142333984, + "distillation_loss": 1.4803380966186523, + "epoch": 1.2, + "learning_rate": 4.398985629754861e-05, + "loss": 80.7196, + "step": 1422, + "task_loss": 1.6573938131332397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7373389317375428, + "compression/movement_sparsity/importance_threshold": -0.0023947782508967476, + "compression/movement_sparsity/linear_layer_sparsity": 0.3507736566900495, + "compression/movement_sparsity/model_sparsity": 0.33872350247675936, + "compression_loss": 79.3774185180664, + "distillation_loss": 0.7501176595687866, + "epoch": 1.2, + "learning_rate": 4.398562975486052e-05, + "loss": 80.4505, + "step": 1423, + "task_loss": 0.9124040007591248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7397575168301977, + "compression/movement_sparsity/importance_threshold": -0.0023871838223637316, + "compression/movement_sparsity/linear_layer_sparsity": 0.35284684017189116, + "compression/movement_sparsity/model_sparsity": 0.34072546572814144, + "compression_loss": 79.6369400024414, + "distillation_loss": 1.8759551048278809, + "epoch": 1.2, + "learning_rate": 4.398140321217244e-05, + "loss": 81.0921, + "step": 1424, + "task_loss": 1.4439818859100342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7421709832261562, + "compression/movement_sparsity/importance_threshold": -0.0023796054666893308, + "compression/movement_sparsity/linear_layer_sparsity": 0.354913763465724, + "compression/movement_sparsity/model_sparsity": 0.3427213838482313, + "compression_loss": 79.89598846435547, + "distillation_loss": 1.3793349266052246, + "epoch": 1.2, + "learning_rate": 4.397717666948436e-05, + "loss": 81.4144, + "step": 1425, + "task_loss": 1.0169471502304077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.744579336347767, + "compression/movement_sparsity/importance_threshold": -0.00237204316684721, + "compression/movement_sparsity/linear_layer_sparsity": 0.3569735322589754, + "compression/movement_sparsity/model_sparsity": 0.3447103932468445, + "compression_loss": 80.1545181274414, + "distillation_loss": 1.1719056367874146, + "epoch": 1.21, + "learning_rate": 4.397295012679628e-05, + "loss": 81.2686, + "step": 1426, + "task_loss": 0.5395320653915405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7469825816173778, + "compression/movement_sparsity/importance_threshold": -0.0023644969058110375, + "compression/movement_sparsity/linear_layer_sparsity": 0.3590391558185359, + "compression/movement_sparsity/model_sparsity": 0.3467050562825328, + "compression_loss": 80.41238403320312, + "distillation_loss": 1.6068623065948486, + "epoch": 1.21, + "learning_rate": 4.39687235841082e-05, + "loss": 81.7638, + "step": 1427, + "task_loss": 1.4671213626861572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7493807244573374, + "compression/movement_sparsity/importance_threshold": -0.002356966666554479, + "compression/movement_sparsity/linear_layer_sparsity": 0.361016707475938, + "compression/movement_sparsity/model_sparsity": 0.3486146729568426, + "compression_loss": 80.669677734375, + "distillation_loss": 2.0336830615997314, + "epoch": 1.21, + "learning_rate": 4.396449704142012e-05, + "loss": 82.1165, + "step": 1428, + "task_loss": 1.1140927076339722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7517737702899931, + "compression/movement_sparsity/importance_threshold": -0.0023494524320512027, + "compression/movement_sparsity/linear_layer_sparsity": 0.3631738609462714, + "compression/movement_sparsity/model_sparsity": 0.35069772156928986, + "compression_loss": 80.92645263671875, + "distillation_loss": 1.7074615955352783, + "epoch": 1.21, + "learning_rate": 4.3960270498732035e-05, + "loss": 82.1109, + "step": 1429, + "task_loss": 0.8876955509185791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7541617245376939, + "compression/movement_sparsity/importance_threshold": -0.0023419541852748743, + "compression/movement_sparsity/linear_layer_sparsity": 0.36523402323705473, + "compression/movement_sparsity/model_sparsity": 0.35268711094758426, + "compression_loss": 81.18262481689453, + "distillation_loss": 1.5673458576202393, + "epoch": 1.21, + "learning_rate": 4.3956043956043955e-05, + "loss": 82.2949, + "step": 1430, + "task_loss": 0.6854345798492432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.756544592622788, + "compression/movement_sparsity/importance_threshold": -0.0023344719091991594, + "compression/movement_sparsity/linear_layer_sparsity": 0.36724220808111335, + "compression/movement_sparsity/model_sparsity": 0.3546263084643502, + "compression_loss": 81.43828582763672, + "distillation_loss": 0.9449591636657715, + "epoch": 1.21, + "learning_rate": 4.395181741335588e-05, + "loss": 82.4104, + "step": 1431, + "task_loss": 0.7326794862747192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7589223799676228, + "compression/movement_sparsity/importance_threshold": -0.0023270055867977276, + "compression/movement_sparsity/linear_layer_sparsity": 0.3694161984761486, + "compression/movement_sparsity/model_sparsity": 0.3567256156013393, + "compression_loss": 81.69331359863281, + "distillation_loss": 1.7695385217666626, + "epoch": 1.21, + "learning_rate": 4.39475908706678e-05, + "loss": 83.2207, + "step": 1432, + "task_loss": 1.5717283487319946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7612950919945474, + "compression/movement_sparsity/importance_threshold": -0.002319555201044243, + "compression/movement_sparsity/linear_layer_sparsity": 0.3716317565195625, + "compression/movement_sparsity/model_sparsity": 0.3588650624101082, + "compression_loss": 81.94783020019531, + "distillation_loss": 1.4127122163772583, + "epoch": 1.21, + "learning_rate": 4.3943364327979714e-05, + "loss": 83.2395, + "step": 1433, + "task_loss": 1.1738373041152954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7636627341259096, + "compression/movement_sparsity/importance_threshold": -0.0023121207349123727, + "compression/movement_sparsity/linear_layer_sparsity": 0.37405204059711655, + "compression/movement_sparsity/model_sparsity": 0.3612022022839335, + "compression_loss": 82.20179748535156, + "distillation_loss": 1.1583659648895264, + "epoch": 1.21, + "learning_rate": 4.3939137785291634e-05, + "loss": 83.6197, + "step": 1434, + "task_loss": 0.5853502154350281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7660253117840572, + "compression/movement_sparsity/importance_threshold": -0.002304702171375786, + "compression/movement_sparsity/linear_layer_sparsity": 0.37599286581820013, + "compression/movement_sparsity/model_sparsity": 0.36307635418799616, + "compression_loss": 82.4552001953125, + "distillation_loss": 0.9428606629371643, + "epoch": 1.21, + "learning_rate": 4.393491124260355e-05, + "loss": 83.542, + "step": 1435, + "task_loss": 0.3349272906780243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7683828303913391, + "compression/movement_sparsity/importance_threshold": -0.0022972994934081455, + "compression/movement_sparsity/linear_layer_sparsity": 0.37809954428693093, + "compression/movement_sparsity/model_sparsity": 0.36511066177042506, + "compression_loss": 82.7081527709961, + "distillation_loss": 1.9129095077514648, + "epoch": 1.21, + "learning_rate": 4.393068469991547e-05, + "loss": 84.0512, + "step": 1436, + "task_loss": 0.9742015600204468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7707352953701033, + "compression/movement_sparsity/importance_threshold": -0.002289912683983121, + "compression/movement_sparsity/linear_layer_sparsity": 0.38036760444044554, + "compression/movement_sparsity/model_sparsity": 0.3673008070802973, + "compression_loss": 82.96049499511719, + "distillation_loss": 1.9891302585601807, + "epoch": 1.21, + "learning_rate": 4.392645815722739e-05, + "loss": 84.2558, + "step": 1437, + "task_loss": 1.8122395277023315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7730827121426974, + "compression/movement_sparsity/importance_threshold": -0.002282541726074379, + "compression/movement_sparsity/linear_layer_sparsity": 0.382474032501656, + "compression/movement_sparsity/model_sparsity": 0.3693348728574745, + "compression_loss": 83.2123031616211, + "distillation_loss": 2.6006221771240234, + "epoch": 1.22, + "learning_rate": 4.392223161453931e-05, + "loss": 85.2433, + "step": 1438, + "task_loss": 2.2420737743377686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7754250861314707, + "compression/movement_sparsity/importance_threshold": -0.002275186602655584, + "compression/movement_sparsity/linear_layer_sparsity": 0.38450653072352414, + "compression/movement_sparsity/model_sparsity": 0.37129754851272556, + "compression_loss": 83.46353912353516, + "distillation_loss": 1.7914044857025146, + "epoch": 1.22, + "learning_rate": 4.3918005071851226e-05, + "loss": 85.5608, + "step": 1439, + "task_loss": 1.5856698751449585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7777624227587706, + "compression/movement_sparsity/importance_threshold": -0.0022678472967004046, + "compression/movement_sparsity/linear_layer_sparsity": 0.3863972625163695, + "compression/movement_sparsity/model_sparsity": 0.3731233278519153, + "compression_loss": 83.71437072753906, + "distillation_loss": 2.3265724182128906, + "epoch": 1.22, + "learning_rate": 4.3913778529163145e-05, + "loss": 85.3337, + "step": 1440, + "task_loss": 1.059078335762024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.780094727446945, + "compression/movement_sparsity/importance_threshold": -0.0022605237911825084, + "compression/movement_sparsity/linear_layer_sparsity": 0.38825155405291967, + "compression/movement_sparsity/model_sparsity": 0.37491391876971697, + "compression_loss": 83.96455383300781, + "distillation_loss": 1.8383489847183228, + "epoch": 1.22, + "learning_rate": 4.3909551986475065e-05, + "loss": 85.5167, + "step": 1441, + "task_loss": 0.8957647085189819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7824220056183431, + "compression/movement_sparsity/importance_threshold": -0.0022532160690755592, + "compression/movement_sparsity/linear_layer_sparsity": 0.3900560741137577, + "compression/movement_sparsity/model_sparsity": 0.3766564480151122, + "compression_loss": 84.21428680419922, + "distillation_loss": 1.6380958557128906, + "epoch": 1.22, + "learning_rate": 4.3905325443786985e-05, + "loss": 85.6299, + "step": 1442, + "task_loss": 1.5284394025802612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7847442626953125, + "compression/movement_sparsity/importance_threshold": -0.002245924113353226, + "compression/movement_sparsity/linear_layer_sparsity": 0.3922005521938942, + "compression/movement_sparsity/model_sparsity": 0.3787272566760098, + "compression_loss": 84.46338653564453, + "distillation_loss": 0.9237058758735657, + "epoch": 1.22, + "learning_rate": 4.3901098901098904e-05, + "loss": 86.0037, + "step": 1443, + "task_loss": 0.7917584180831909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7870615041002014, + "compression/movement_sparsity/importance_threshold": -0.002238647906989175, + "compression/movement_sparsity/linear_layer_sparsity": 0.3941746577668495, + "compression/movement_sparsity/model_sparsity": 0.38063354564947505, + "compression_loss": 84.71204376220703, + "distillation_loss": 1.927260398864746, + "epoch": 1.22, + "learning_rate": 4.3896872358410824e-05, + "loss": 86.077, + "step": 1444, + "task_loss": 1.1530635356903076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7893737352553581, + "compression/movement_sparsity/importance_threshold": -0.002231387432957072, + "compression/movement_sparsity/linear_layer_sparsity": 0.3961089962407392, + "compression/movement_sparsity/model_sparsity": 0.3825014336460654, + "compression_loss": 84.96006774902344, + "distillation_loss": 2.230501174926758, + "epoch": 1.22, + "learning_rate": 4.389264581572274e-05, + "loss": 86.6381, + "step": 1445, + "task_loss": 1.447485327720642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7916809615831306, + "compression/movement_sparsity/importance_threshold": -0.002224142674230586, + "compression/movement_sparsity/linear_layer_sparsity": 0.39809811434074804, + "compression/movement_sparsity/model_sparsity": 0.38442221942009597, + "compression_loss": 85.20756530761719, + "distillation_loss": 1.4549471139907837, + "epoch": 1.22, + "learning_rate": 4.388841927303466e-05, + "loss": 86.703, + "step": 1446, + "task_loss": 1.2901946306228638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7939831885058674, + "compression/movement_sparsity/importance_threshold": -0.0022169136137833814, + "compression/movement_sparsity/linear_layer_sparsity": 0.3998693778980497, + "compression/movement_sparsity/model_sparsity": 0.38613263462516023, + "compression_loss": 85.45455169677734, + "distillation_loss": 1.4603662490844727, + "epoch": 1.22, + "learning_rate": 4.388419273034658e-05, + "loss": 87.1138, + "step": 1447, + "task_loss": 1.4821934700012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7962804214459166, + "compression/movement_sparsity/importance_threshold": -0.0022097002345891252, + "compression/movement_sparsity/linear_layer_sparsity": 0.40187120716075836, + "compression/movement_sparsity/model_sparsity": 0.3880656948943477, + "compression_loss": 85.70106506347656, + "distillation_loss": 1.9548306465148926, + "epoch": 1.22, + "learning_rate": 4.38799661876585e-05, + "loss": 87.0557, + "step": 1448, + "task_loss": 1.3373868465423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7985726658256258, + "compression/movement_sparsity/importance_threshold": -0.0022025025196214863, + "compression/movement_sparsity/linear_layer_sparsity": 0.40381174620181876, + "compression/movement_sparsity/model_sparsity": 0.3899395704495513, + "compression_loss": 85.94697570800781, + "distillation_loss": 1.0626968145370483, + "epoch": 1.22, + "learning_rate": 4.3875739644970416e-05, + "loss": 87.6983, + "step": 1449, + "task_loss": 0.2939516305923462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8008599270673442, + "compression/movement_sparsity/importance_threshold": -0.002195320451854129, + "compression/movement_sparsity/linear_layer_sparsity": 0.4057403133785727, + "compression/movement_sparsity/model_sparsity": 0.3918018854108172, + "compression_loss": 86.19231414794922, + "distillation_loss": 2.3794965744018555, + "epoch": 1.23, + "learning_rate": 4.3871513102282336e-05, + "loss": 88.0952, + "step": 1450, + "task_loss": 1.582653284072876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8031422105934196, + "compression/movement_sparsity/importance_threshold": -0.0021881540142607205, + "compression/movement_sparsity/linear_layer_sparsity": 0.40775039416528536, + "compression/movement_sparsity/model_sparsity": 0.3937429137387745, + "compression_loss": 86.43717193603516, + "distillation_loss": 1.5740033388137817, + "epoch": 1.23, + "learning_rate": 4.3867286559594256e-05, + "loss": 88.1893, + "step": 1451, + "task_loss": 1.5919039249420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8054195218261998, + "compression/movement_sparsity/importance_threshold": -0.0021810031898149285, + "compression/movement_sparsity/linear_layer_sparsity": 0.40968184699060717, + "compression/movement_sparsity/model_sparsity": 0.3956080152177026, + "compression_loss": 86.68151092529297, + "distillation_loss": 1.3980646133422852, + "epoch": 1.23, + "learning_rate": 4.386306001690617e-05, + "loss": 88.3813, + "step": 1452, + "task_loss": 2.2090260982513428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8076918661880335, + "compression/movement_sparsity/importance_threshold": -0.0021738679614904184, + "compression/movement_sparsity/linear_layer_sparsity": 0.4115200170525137, + "compression/movement_sparsity/model_sparsity": 0.3973830384831101, + "compression_loss": 86.92528533935547, + "distillation_loss": 1.981194257736206, + "epoch": 1.23, + "learning_rate": 4.3858833474218095e-05, + "loss": 88.822, + "step": 1453, + "task_loss": 1.2608253955841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8099592491012688, + "compression/movement_sparsity/importance_threshold": -0.002166748312260858, + "compression/movement_sparsity/linear_layer_sparsity": 0.41346941575012747, + "compression/movement_sparsity/model_sparsity": 0.39926546933840895, + "compression_loss": 87.16854858398438, + "distillation_loss": 1.5396332740783691, + "epoch": 1.23, + "learning_rate": 4.3854606931530015e-05, + "loss": 88.7082, + "step": 1454, + "task_loss": 1.3598216772079468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8122216759882535, + "compression/movement_sparsity/importance_threshold": -0.002159644225099914, + "compression/movement_sparsity/linear_layer_sparsity": 0.41532618751354583, + "compression/movement_sparsity/model_sparsity": 0.4010584552796559, + "compression_loss": 87.41122436523438, + "distillation_loss": 3.2592477798461914, + "epoch": 1.23, + "learning_rate": 4.385038038884193e-05, + "loss": 89.797, + "step": 1455, + "task_loss": 1.6531397104263306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8144791522713364, + "compression/movement_sparsity/importance_threshold": -0.0021525556829812523, + "compression/movement_sparsity/linear_layer_sparsity": 0.4172355448562384, + "compression/movement_sparsity/model_sparsity": 0.4029022203237567, + "compression_loss": 87.65345764160156, + "distillation_loss": 1.3059204816818237, + "epoch": 1.23, + "learning_rate": 4.384615384615385e-05, + "loss": 89.4426, + "step": 1456, + "task_loss": 0.5418781042098999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8167316833728653, + "compression/movement_sparsity/importance_threshold": -0.00214548266887854, + "compression/movement_sparsity/linear_layer_sparsity": 0.4190575934435012, + "compression/movement_sparsity/model_sparsity": 0.40466167593676994, + "compression_loss": 87.89512634277344, + "distillation_loss": 1.720576286315918, + "epoch": 1.23, + "learning_rate": 4.384192730346577e-05, + "loss": 89.5843, + "step": 1457, + "task_loss": 0.6119003295898438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8189792747151883, + "compression/movement_sparsity/importance_threshold": -0.0021384251657654446, + "compression/movement_sparsity/linear_layer_sparsity": 0.4210763431000852, + "compression/movement_sparsity/model_sparsity": 0.40661107533224994, + "compression_loss": 88.13621520996094, + "distillation_loss": 2.311744451522827, + "epoch": 1.23, + "learning_rate": 4.383770076077768e-05, + "loss": 89.8228, + "step": 1458, + "task_loss": 1.4281361103057861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.821221931720654, + "compression/movement_sparsity/importance_threshold": -0.002131383156615631, + "compression/movement_sparsity/linear_layer_sparsity": 0.4230505202180463, + "compression/movement_sparsity/model_sparsity": 0.4085174333929299, + "compression_loss": 88.37676239013672, + "distillation_loss": 1.977921724319458, + "epoch": 1.23, + "learning_rate": 4.383347421808961e-05, + "loss": 90.2036, + "step": 1459, + "task_loss": 1.280239462852478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8234596598116104, + "compression/movement_sparsity/importance_threshold": -0.0021243566244027678, + "compression/movement_sparsity/linear_layer_sparsity": 0.42487977100256114, + "compression/movement_sparsity/model_sparsity": 0.410283843785563, + "compression_loss": 88.61685943603516, + "distillation_loss": 1.7213938236236572, + "epoch": 1.23, + "learning_rate": 4.3829247675401526e-05, + "loss": 91.0051, + "step": 1460, + "task_loss": 2.0139009952545166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8256924644104057, + "compression/movement_sparsity/importance_threshold": -0.0021173455521005208, + "compression/movement_sparsity/linear_layer_sparsity": 0.42689720900070516, + "compression/movement_sparsity/model_sparsity": 0.4122319765821056, + "compression_loss": 88.8563461303711, + "distillation_loss": 1.3316011428833008, + "epoch": 1.23, + "learning_rate": 4.3825021132713446e-05, + "loss": 90.1861, + "step": 1461, + "task_loss": 1.7979121208190918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8279203509393878, + "compression/movement_sparsity/importance_threshold": -0.002110349922682557, + "compression/movement_sparsity/linear_layer_sparsity": 0.42897252690855364, + "compression/movement_sparsity/model_sparsity": 0.4142360009353948, + "compression_loss": 89.09526062011719, + "distillation_loss": 2.021014451980591, + "epoch": 1.24, + "learning_rate": 4.382079459002536e-05, + "loss": 90.791, + "step": 1462, + "task_loss": 1.4371663331985474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8301433248209054, + "compression/movement_sparsity/importance_threshold": -0.002103369719122543, + "compression/movement_sparsity/linear_layer_sparsity": 0.43085194266631255, + "compression/movement_sparsity/model_sparsity": 0.41605085298011557, + "compression_loss": 89.33366394042969, + "distillation_loss": 2.145749807357788, + "epoch": 1.24, + "learning_rate": 4.381656804733728e-05, + "loss": 91.4833, + "step": 1463, + "task_loss": 1.613084316253662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8323613914773064, + "compression/movement_sparsity/importance_threshold": -0.0020964049243941455, + "compression/movement_sparsity/linear_layer_sparsity": 0.43277345073582607, + "compression/movement_sparsity/model_sparsity": 0.4179063513361911, + "compression_loss": 89.57154083251953, + "distillation_loss": 1.9125415086746216, + "epoch": 1.24, + "learning_rate": 4.38123415046492e-05, + "loss": 91.5646, + "step": 1464, + "task_loss": 1.4451509714126587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8345745563309392, + "compression/movement_sparsity/importance_threshold": -0.002089455521471031, + "compression/movement_sparsity/linear_layer_sparsity": 0.43474957149311183, + "compression/movement_sparsity/model_sparsity": 0.41981458626620555, + "compression_loss": 89.80891418457031, + "distillation_loss": 1.868727445602417, + "epoch": 1.24, + "learning_rate": 4.380811496196112e-05, + "loss": 91.9819, + "step": 1465, + "task_loss": 1.6708552837371826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8367828248041516, + "compression/movement_sparsity/importance_threshold": -0.0020825214933268667, + "compression/movement_sparsity/linear_layer_sparsity": 0.4366693267099829, + "compression/movement_sparsity/model_sparsity": 0.42166839198551925, + "compression_loss": 90.04573822021484, + "distillation_loss": 2.5361168384552, + "epoch": 1.24, + "learning_rate": 4.380388841927304e-05, + "loss": 91.8456, + "step": 1466, + "task_loss": 1.4672216176986694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8389862023192922, + "compression/movement_sparsity/importance_threshold": -0.002075602822935319, + "compression/movement_sparsity/linear_layer_sparsity": 0.43848878775286865, + "compression/movement_sparsity/model_sparsity": 0.42342534894426503, + "compression_loss": 90.28205108642578, + "distillation_loss": 0.8657584190368652, + "epoch": 1.24, + "learning_rate": 4.379966187658496e-05, + "loss": 91.6539, + "step": 1467, + "task_loss": 0.7657065391540527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8411846942987089, + "compression/movement_sparsity/importance_threshold": -0.0020686994932700552, + "compression/movement_sparsity/linear_layer_sparsity": 0.44055661728344186, + "compression/movement_sparsity/model_sparsity": 0.4254221421690753, + "compression_loss": 90.51786804199219, + "distillation_loss": 1.1345958709716797, + "epoch": 1.24, + "learning_rate": 4.379543533389687e-05, + "loss": 91.8193, + "step": 1468, + "task_loss": 1.4328489303588867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.84337830616475, + "compression/movement_sparsity/importance_threshold": -0.0020618114873047415, + "compression/movement_sparsity/linear_layer_sparsity": 0.44240591259375256, + "compression/movement_sparsity/model_sparsity": 0.4272079084963791, + "compression_loss": 90.7530517578125, + "distillation_loss": 2.471135377883911, + "epoch": 1.24, + "learning_rate": 4.379120879120879e-05, + "loss": 92.5652, + "step": 1469, + "task_loss": 1.656062126159668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.845567043339764, + "compression/movement_sparsity/importance_threshold": -0.0020549387880130442, + "compression/movement_sparsity/linear_layer_sparsity": 0.44437739484982797, + "compression/movement_sparsity/model_sparsity": 0.42911166427196945, + "compression_loss": 90.98782348632812, + "distillation_loss": 1.2275235652923584, + "epoch": 1.24, + "learning_rate": 4.378698224852072e-05, + "loss": 92.1805, + "step": 1470, + "task_loss": 0.8283646106719971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8477509112460988, + "compression/movement_sparsity/importance_threshold": -0.00204808137836863, + "compression/movement_sparsity/linear_layer_sparsity": 0.4463222504395725, + "compression/movement_sparsity/model_sparsity": 0.4309897080891307, + "compression_loss": 91.22193145751953, + "distillation_loss": 1.943591833114624, + "epoch": 1.24, + "learning_rate": 4.378275570583263e-05, + "loss": 93.0569, + "step": 1471, + "task_loss": 1.2804784774780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8499299153061023, + "compression/movement_sparsity/importance_threshold": -0.0020412392413451676, + "compression/movement_sparsity/linear_layer_sparsity": 0.4482376294869212, + "compression/movement_sparsity/model_sparsity": 0.4328392879738078, + "compression_loss": 91.45556640625, + "distillation_loss": 1.5874154567718506, + "epoch": 1.24, + "learning_rate": 4.377852916314455e-05, + "loss": 94.0066, + "step": 1472, + "task_loss": 0.7793343663215637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8521040609421231, + "compression/movement_sparsity/importance_threshold": -0.0020344123599163213, + "compression/movement_sparsity/linear_layer_sparsity": 0.4503339220056412, + "compression/movement_sparsity/model_sparsity": 0.4348635663955596, + "compression_loss": 91.68872833251953, + "distillation_loss": 2.8074960708618164, + "epoch": 1.24, + "learning_rate": 4.377430262045647e-05, + "loss": 93.4532, + "step": 1473, + "task_loss": 1.2144739627838135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8542733535765097, + "compression/movement_sparsity/importance_threshold": -0.0020276007170557586, + "compression/movement_sparsity/linear_layer_sparsity": 0.452360601233703, + "compression/movement_sparsity/model_sparsity": 0.4368206229573429, + "compression_loss": 91.9212875366211, + "distillation_loss": 2.3325397968292236, + "epoch": 1.25, + "learning_rate": 4.377007607776838e-05, + "loss": 94.07, + "step": 1474, + "task_loss": 1.2730668783187866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8564377986316093, + "compression/movement_sparsity/importance_threshold": -0.0020208042957371476, + "compression/movement_sparsity/linear_layer_sparsity": 0.4541298496066742, + "compression/movement_sparsity/model_sparsity": 0.43852909220585795, + "compression_loss": 92.15338134765625, + "distillation_loss": 2.2134971618652344, + "epoch": 1.25, + "learning_rate": 4.37658495350803e-05, + "loss": 93.9278, + "step": 1475, + "task_loss": 1.9229933023452759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.858597401529771, + "compression/movement_sparsity/importance_threshold": -0.002014023078934153, + "compression/movement_sparsity/linear_layer_sparsity": 0.4562388890605969, + "compression/movement_sparsity/model_sparsity": 0.44056567966637417, + "compression_loss": 92.38483428955078, + "distillation_loss": 2.442070245742798, + "epoch": 1.25, + "learning_rate": 4.376162299239223e-05, + "loss": 94.9096, + "step": 1476, + "task_loss": 2.572432518005371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.860752167693343, + "compression/movement_sparsity/importance_threshold": -0.002007257049620441, + "compression/movement_sparsity/linear_layer_sparsity": 0.4579981807535921, + "compression/movement_sparsity/model_sparsity": 0.44226453427750073, + "compression_loss": 92.61590576171875, + "distillation_loss": 2.8263728618621826, + "epoch": 1.25, + "learning_rate": 4.375739644970415e-05, + "loss": 94.9327, + "step": 1477, + "task_loss": 2.4674713611602783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8629021025446726, + "compression/movement_sparsity/importance_threshold": -0.0020005061907696817, + "compression/movement_sparsity/linear_layer_sparsity": 0.45975876025669205, + "compression/movement_sparsity/model_sparsity": 0.44396463245849305, + "compression_loss": 92.84642028808594, + "distillation_loss": 1.3342137336730957, + "epoch": 1.25, + "learning_rate": 4.375316990701606e-05, + "loss": 94.4951, + "step": 1478, + "task_loss": 0.7381772398948669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.865047211506109, + "compression/movement_sparsity/importance_threshold": -0.001993770485355539, + "compression/movement_sparsity/linear_layer_sparsity": 0.46180760651238895, + "compression/movement_sparsity/model_sparsity": 0.44594309454231845, + "compression_loss": 93.07637023925781, + "distillation_loss": 1.8354148864746094, + "epoch": 1.25, + "learning_rate": 4.374894336432798e-05, + "loss": 94.6196, + "step": 1479, + "task_loss": 1.4156618118286133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8671875, + "compression/movement_sparsity/importance_threshold": -0.0019870499163516797, + "compression/movement_sparsity/linear_layer_sparsity": 0.4635489165605893, + "compression/movement_sparsity/model_sparsity": 0.4476245852334668, + "compression_loss": 93.30579376220703, + "distillation_loss": 0.9454125761985779, + "epoch": 1.25, + "learning_rate": 4.37447168216399e-05, + "loss": 94.7726, + "step": 1480, + "task_loss": 0.47018828988075256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8693229734486936, + "compression/movement_sparsity/importance_threshold": -0.001980344466731772, + "compression/movement_sparsity/linear_layer_sparsity": 0.46562029949311795, + "compression/movement_sparsity/model_sparsity": 0.4496248097899439, + "compression_loss": 93.53468322753906, + "distillation_loss": 2.756983757019043, + "epoch": 1.25, + "learning_rate": 4.374049027895182e-05, + "loss": 95.5078, + "step": 1481, + "task_loss": 2.7871975898742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8714536372745385, + "compression/movement_sparsity/importance_threshold": -0.0019736541194694808, + "compression/movement_sparsity/linear_layer_sparsity": 0.4675978153780171, + "compression/movement_sparsity/model_sparsity": 0.4515343919206463, + "compression_loss": 93.76298522949219, + "distillation_loss": 2.8619112968444824, + "epoch": 1.25, + "learning_rate": 4.373626373626374e-05, + "loss": 95.9601, + "step": 1482, + "task_loss": 2.1617841720581055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.873579496899882, + "compression/movement_sparsity/importance_threshold": -0.0019669788575384754, + "compression/movement_sparsity/linear_layer_sparsity": 0.469544721924595, + "compression/movement_sparsity/model_sparsity": 0.4534144162379642, + "compression_loss": 93.99088287353516, + "distillation_loss": 1.434727430343628, + "epoch": 1.25, + "learning_rate": 4.373203719357566e-05, + "loss": 95.6824, + "step": 1483, + "task_loss": 1.8588767051696777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8757005577470733, + "compression/movement_sparsity/importance_threshold": -0.00196031866391242, + "compression/movement_sparsity/linear_layer_sparsity": 0.4713988584469659, + "compression/movement_sparsity/model_sparsity": 0.4552048574668005, + "compression_loss": 94.21818542480469, + "distillation_loss": 1.7201147079467773, + "epoch": 1.25, + "learning_rate": 4.372781065088757e-05, + "loss": 96.1279, + "step": 1484, + "task_loss": 0.8511843085289001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.87781682523846, + "compression/movement_sparsity/importance_threshold": -0.0019536735215649826, + "compression/movement_sparsity/linear_layer_sparsity": 0.4732601136974154, + "compression/movement_sparsity/model_sparsity": 0.4570021728735062, + "compression_loss": 94.44505310058594, + "distillation_loss": 1.5092852115631104, + "epoch": 1.26, + "learning_rate": 4.372358410819949e-05, + "loss": 96.9364, + "step": 1485, + "task_loss": 1.259634256362915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8799283047963904, + "compression/movement_sparsity/importance_threshold": -0.0019470434134698299, + "compression/movement_sparsity/linear_layer_sparsity": 0.4750041901525072, + "compression/movement_sparsity/model_sparsity": 0.4586863349369589, + "compression_loss": 94.6713638305664, + "distillation_loss": 1.6528480052947998, + "epoch": 1.26, + "learning_rate": 4.371935756551141e-05, + "loss": 96.3093, + "step": 1486, + "task_loss": 1.1547406911849976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8820350018432129, + "compression/movement_sparsity/importance_threshold": -0.0019404283226006278, + "compression/movement_sparsity/linear_layer_sparsity": 0.4768658031279858, + "compression/movement_sparsity/model_sparsity": 0.4604839957797384, + "compression_loss": 94.89723205566406, + "distillation_loss": 2.292375087738037, + "epoch": 1.26, + "learning_rate": 4.371513102282333e-05, + "loss": 96.9592, + "step": 1487, + "task_loss": 0.9253711104393005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8841369218012756, + "compression/movement_sparsity/importance_threshold": -0.0019338282319310432, + "compression/movement_sparsity/linear_layer_sparsity": 0.47882526582308427, + "compression/movement_sparsity/model_sparsity": 0.4623761449032479, + "compression_loss": 95.12249755859375, + "distillation_loss": 3.1671009063720703, + "epoch": 1.26, + "learning_rate": 4.371090448013525e-05, + "loss": 97.1826, + "step": 1488, + "task_loss": 1.5423450469970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8862340700929262, + "compression/movement_sparsity/importance_threshold": -0.0019272431244347446, + "compression/movement_sparsity/linear_layer_sparsity": 0.4806814175297856, + "compression/movement_sparsity/model_sparsity": 0.4641685320886335, + "compression_loss": 95.34727478027344, + "distillation_loss": 2.4407575130462646, + "epoch": 1.26, + "learning_rate": 4.370667793744717e-05, + "loss": 97.8708, + "step": 1489, + "task_loss": 1.1997573375701904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8883264521405138, + "compression/movement_sparsity/importance_threshold": -0.001920672983085396, + "compression/movement_sparsity/linear_layer_sparsity": 0.4825638143294535, + "compression/movement_sparsity/model_sparsity": 0.4659862627673029, + "compression_loss": 95.57157897949219, + "distillation_loss": 2.453951358795166, + "epoch": 1.26, + "learning_rate": 4.370245139475909e-05, + "loss": 97.779, + "step": 1490, + "task_loss": 1.8541613817214966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8904140733663858, + "compression/movement_sparsity/importance_threshold": -0.0019141177908566665, + "compression/movement_sparsity/linear_layer_sparsity": 0.4842969920953262, + "compression/movement_sparsity/model_sparsity": 0.4676599005450394, + "compression_loss": 95.79537200927734, + "distillation_loss": 2.0961103439331055, + "epoch": 1.26, + "learning_rate": 4.3698224852071004e-05, + "loss": 98.0075, + "step": 1491, + "task_loss": 1.0674598217010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8924969391928907, + "compression/movement_sparsity/importance_threshold": -0.0019075775307222215, + "compression/movement_sparsity/linear_layer_sparsity": 0.48607699606750493, + "compression/movement_sparsity/model_sparsity": 0.46937875590484107, + "compression_loss": 96.01873016357422, + "distillation_loss": 2.028595209121704, + "epoch": 1.26, + "learning_rate": 4.3693998309382924e-05, + "loss": 98.1846, + "step": 1492, + "task_loss": 1.8315556049346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8945750550423769, + "compression/movement_sparsity/importance_threshold": -0.001901052185655728, + "compression/movement_sparsity/linear_layer_sparsity": 0.4879725452240751, + "compression/movement_sparsity/model_sparsity": 0.47120918711649185, + "compression_loss": 96.24153137207031, + "distillation_loss": 2.286721706390381, + "epoch": 1.26, + "learning_rate": 4.368977176669485e-05, + "loss": 98.6716, + "step": 1493, + "task_loss": 2.710057020187378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8966484263371924, + "compression/movement_sparsity/importance_threshold": -0.0018945417386308522, + "compression/movement_sparsity/linear_layer_sparsity": 0.4897690760925972, + "compression/movement_sparsity/model_sparsity": 0.47294400162290473, + "compression_loss": 96.46377563476562, + "distillation_loss": 2.9375553131103516, + "epoch": 1.26, + "learning_rate": 4.3685545224006764e-05, + "loss": 99.0742, + "step": 1494, + "task_loss": 2.3950765132904053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8987170584996854, + "compression/movement_sparsity/importance_threshold": -0.0018880461726212615, + "compression/movement_sparsity/linear_layer_sparsity": 0.4915393976406556, + "compression/movement_sparsity/model_sparsity": 0.4746535071796412, + "compression_loss": 96.68563842773438, + "distillation_loss": 1.6906776428222656, + "epoch": 1.26, + "learning_rate": 4.3681318681318683e-05, + "loss": 98.3729, + "step": 1495, + "task_loss": 1.2968608140945435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9007809569522041, + "compression/movement_sparsity/importance_threshold": -0.0018815654706006227, + "compression/movement_sparsity/linear_layer_sparsity": 0.49322533185435496, + "compression/movement_sparsity/model_sparsity": 0.47628152436655974, + "compression_loss": 96.9068603515625, + "distillation_loss": 2.6485891342163086, + "epoch": 1.26, + "learning_rate": 4.36770921386306e-05, + "loss": 99.156, + "step": 1496, + "task_loss": 1.5425375699996948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9028401271170969, + "compression/movement_sparsity/importance_threshold": -0.0018750996155426011, + "compression/movement_sparsity/linear_layer_sparsity": 0.4950823182527908, + "compression/movement_sparsity/model_sparsity": 0.47807471756945097, + "compression_loss": 97.12759399414062, + "distillation_loss": 2.005417823791504, + "epoch": 1.27, + "learning_rate": 4.3672865595942516e-05, + "loss": 99.6693, + "step": 1497, + "task_loss": 0.7923761606216431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9048945744167113, + "compression/movement_sparsity/importance_threshold": -0.0018686485904208664, + "compression/movement_sparsity/linear_layer_sparsity": 0.4969240774891557, + "compression/movement_sparsity/model_sparsity": 0.4798532067101326, + "compression_loss": 97.34791564941406, + "distillation_loss": 2.7149128913879395, + "epoch": 1.27, + "learning_rate": 4.366863905325444e-05, + "loss": 100.0436, + "step": 1498, + "task_loss": 1.74092435836792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9069443042733967, + "compression/movement_sparsity/importance_threshold": -0.0018622123782090815, + "compression/movement_sparsity/linear_layer_sparsity": 0.49890451479512565, + "compression/movement_sparsity/model_sparsity": 0.4817656099021047, + "compression_loss": 97.56766510009766, + "distillation_loss": 3.3202428817749023, + "epoch": 1.27, + "learning_rate": 4.366441251056636e-05, + "loss": 100.1968, + "step": 1499, + "task_loss": 1.7851262092590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9089893221095, + "compression/movement_sparsity/importance_threshold": -0.0018557909618809165, + "compression/movement_sparsity/linear_layer_sparsity": 0.5006381456793685, + "compression/movement_sparsity/model_sparsity": 0.4834396852322014, + "compression_loss": 97.78693389892578, + "distillation_loss": 2.261439800262451, + "epoch": 1.27, + "learning_rate": 4.3660185967878275e-05, + "loss": 100.1552, + "step": 1500, + "task_loss": 2.0748744010925293 + }, + { + "epoch": 1.27, + "eval_accuracy": 0.7445148514851485, + "eval_loss": 99.59747314453125, + "eval_runtime": 376.0733, + "eval_samples_per_second": 67.141, + "eval_steps_per_second": 0.526, + "step": 1500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9110296333473702, + "compression/movement_sparsity/importance_threshold": -0.0018493843244100365, + "compression/movement_sparsity/linear_layer_sparsity": 0.5023965669081263, + "compression/movement_sparsity/model_sparsity": 0.48513769928221495, + "compression_loss": 98.0056381225586, + "distillation_loss": 2.453676700592041, + "epoch": 1.27, + "learning_rate": 4.3655959425190195e-05, + "loss": 100.7752, + "step": 1501, + "task_loss": 1.9804919958114624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9130652434093551, + "compression/movement_sparsity/importance_threshold": -0.0018429924487701086, + "compression/movement_sparsity/linear_layer_sparsity": 0.5041724093458001, + "compression/movement_sparsity/model_sparsity": 0.4868525360690243, + "compression_loss": 98.22395324707031, + "distillation_loss": 3.336181163787842, + "epoch": 1.27, + "learning_rate": 4.3651732882502115e-05, + "loss": 100.9461, + "step": 1502, + "task_loss": 1.8443694114685059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9150961577178031, + "compression/movement_sparsity/importance_threshold": -0.0018366153179347998, + "compression/movement_sparsity/linear_layer_sparsity": 0.5058707566180084, + "compression/movement_sparsity/model_sparsity": 0.48849253988770497, + "compression_loss": 98.44169616699219, + "distillation_loss": 1.8525476455688477, + "epoch": 1.27, + "learning_rate": 4.3647506339814035e-05, + "loss": 100.3817, + "step": 1503, + "task_loss": 0.7774598002433777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9171223816950625, + "compression/movement_sparsity/importance_threshold": -0.0018302529148777756, + "compression/movement_sparsity/linear_layer_sparsity": 0.5078079568921308, + "compression/movement_sparsity/model_sparsity": 0.49036319137288603, + "compression_loss": 98.65892791748047, + "distillation_loss": 2.2874534130096436, + "epoch": 1.27, + "learning_rate": 4.3643279797125954e-05, + "loss": 101.0286, + "step": 1504, + "task_loss": 0.7870293259620667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9191439207634815, + "compression/movement_sparsity/importance_threshold": -0.0018239052225727031, + "compression/movement_sparsity/linear_layer_sparsity": 0.5097417468543092, + "compression/movement_sparsity/model_sparsity": 0.4922305497008299, + "compression_loss": 98.87566375732422, + "distillation_loss": 3.596014976501465, + "epoch": 1.27, + "learning_rate": 4.3639053254437874e-05, + "loss": 101.5716, + "step": 1505, + "task_loss": 1.4280626773834229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9211607803454076, + "compression/movement_sparsity/importance_threshold": -0.0018175722239932514, + "compression/movement_sparsity/linear_layer_sparsity": 0.5114834146275387, + "compression/movement_sparsity/model_sparsity": 0.49391238582805214, + "compression_loss": 99.09185028076172, + "distillation_loss": 1.1857656240463257, + "epoch": 1.27, + "learning_rate": 4.3634826711749794e-05, + "loss": 101.5815, + "step": 1506, + "task_loss": 1.2029460668563843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9231729658631901, + "compression/movement_sparsity/importance_threshold": -0.0018112539021130835, + "compression/movement_sparsity/linear_layer_sparsity": 0.5132793731360141, + "compression/movement_sparsity/model_sparsity": 0.4956466476367469, + "compression_loss": 99.30754089355469, + "distillation_loss": 3.269878387451172, + "epoch": 1.27, + "learning_rate": 4.363060016906171e-05, + "loss": 101.7911, + "step": 1507, + "task_loss": 1.5143624544143677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9251804827391765, + "compression/movement_sparsity/importance_threshold": -0.0018049502399058682, + "compression/movement_sparsity/linear_layer_sparsity": 0.5150177259906408, + "compression/movement_sparsity/model_sparsity": 0.49732528272301824, + "compression_loss": 99.52271270751953, + "distillation_loss": 4.235038757324219, + "epoch": 1.27, + "learning_rate": 4.3626373626373626e-05, + "loss": 102.5151, + "step": 1508, + "task_loss": 3.344265937805176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9271833363957147, + "compression/movement_sparsity/importance_threshold": -0.0017986612203452733, + "compression/movement_sparsity/linear_layer_sparsity": 0.5166853685311867, + "compression/movement_sparsity/model_sparsity": 0.49893563661202794, + "compression_loss": 99.73743438720703, + "distillation_loss": 2.83672833442688, + "epoch": 1.28, + "learning_rate": 4.3622147083685546e-05, + "loss": 102.6316, + "step": 1509, + "task_loss": 1.4607300758361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9291815322551539, + "compression/movement_sparsity/importance_threshold": -0.0017923868264049627, + "compression/movement_sparsity/linear_layer_sparsity": 0.518119845897781, + "compression/movement_sparsity/model_sparsity": 0.5003208352681112, + "compression_loss": 99.95162200927734, + "distillation_loss": 2.564958095550537, + "epoch": 1.28, + "learning_rate": 4.3617920540997466e-05, + "loss": 102.647, + "step": 1510, + "task_loss": 2.469444990158081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9311750757398416, + "compression/movement_sparsity/importance_threshold": -0.0017861270410586046, + "compression/movement_sparsity/linear_layer_sparsity": 0.5200264487577497, + "compression/movement_sparsity/model_sparsity": 0.5021619404544435, + "compression_loss": 100.16531372070312, + "distillation_loss": 2.4818615913391113, + "epoch": 1.28, + "learning_rate": 4.3613693998309386e-05, + "loss": 102.6456, + "step": 1511, + "task_loss": 1.5539356470108032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9331639722721259, + "compression/movement_sparsity/importance_threshold": -0.0017798818472798666, + "compression/movement_sparsity/linear_layer_sparsity": 0.5218575000915776, + "compression/movement_sparsity/model_sparsity": 0.5039300895419816, + "compression_loss": 100.37849426269531, + "distillation_loss": 2.1918630599975586, + "epoch": 1.28, + "learning_rate": 4.3609467455621305e-05, + "loss": 102.4499, + "step": 1512, + "task_loss": 1.3998414278030396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9351482272743554, + "compression/movement_sparsity/importance_threshold": -0.0017736512280424137, + "compression/movement_sparsity/linear_layer_sparsity": 0.5234306436036094, + "compression/movement_sparsity/model_sparsity": 0.5054491907348195, + "compression_loss": 100.59117889404297, + "distillation_loss": 2.5820369720458984, + "epoch": 1.28, + "learning_rate": 4.360524091293322e-05, + "loss": 103.1039, + "step": 1513, + "task_loss": 1.0723028182983398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9371278461688781, + "compression/movement_sparsity/importance_threshold": -0.001767435166319914, + "compression/movement_sparsity/linear_layer_sparsity": 0.5252309663574396, + "compression/movement_sparsity/model_sparsity": 0.5071876668636152, + "compression_loss": 100.80329895019531, + "distillation_loss": 4.318636417388916, + "epoch": 1.28, + "learning_rate": 4.360101437024514e-05, + "loss": 104.475, + "step": 1514, + "task_loss": 1.7804205417633057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9391028343780421, + "compression/movement_sparsity/importance_threshold": -0.0017612336450860337, + "compression/movement_sparsity/linear_layer_sparsity": 0.5269607457355361, + "compression/movement_sparsity/model_sparsity": 0.5088580229986502, + "compression_loss": 101.01500701904297, + "distillation_loss": 3.642416000366211, + "epoch": 1.28, + "learning_rate": 4.3596787827557065e-05, + "loss": 103.5358, + "step": 1515, + "task_loss": 1.6169562339782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9410731973241961, + "compression/movement_sparsity/importance_threshold": -0.0017550466473144383, + "compression/movement_sparsity/linear_layer_sparsity": 0.5290192744153533, + "compression/movement_sparsity/model_sparsity": 0.5108458348855407, + "compression_loss": 101.22613525390625, + "distillation_loss": 3.3107755184173584, + "epoch": 1.28, + "learning_rate": 4.359256128486898e-05, + "loss": 104.0058, + "step": 1516, + "task_loss": 2.4476919174194336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9430389404296875, + "compression/movement_sparsity/importance_threshold": -0.0017488741559787968, + "compression/movement_sparsity/linear_layer_sparsity": 0.5309074544363246, + "compression/movement_sparsity/model_sparsity": 0.5126691501140704, + "compression_loss": 101.43675231933594, + "distillation_loss": 1.8777174949645996, + "epoch": 1.28, + "learning_rate": 4.35883347421809e-05, + "loss": 103.4714, + "step": 1517, + "task_loss": 0.9147438406944275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9450000691168647, + "compression/movement_sparsity/importance_threshold": -0.0017427161540527754, + "compression/movement_sparsity/linear_layer_sparsity": 0.5325703630823191, + "compression/movement_sparsity/model_sparsity": 0.5142749327323698, + "compression_loss": 101.64681243896484, + "distillation_loss": 3.4031283855438232, + "epoch": 1.28, + "learning_rate": 4.358410819949282e-05, + "loss": 104.4431, + "step": 1518, + "task_loss": 1.4187113046646118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9469565888080768, + "compression/movement_sparsity/importance_threshold": -0.0017365726245100383, + "compression/movement_sparsity/linear_layer_sparsity": 0.5342949792958293, + "compression/movement_sparsity/model_sparsity": 0.5159403030734058, + "compression_loss": 101.85639953613281, + "distillation_loss": 3.1535377502441406, + "epoch": 1.28, + "learning_rate": 4.357988165680474e-05, + "loss": 104.5497, + "step": 1519, + "task_loss": 1.2348978519439697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9489085049256708, + "compression/movement_sparsity/importance_threshold": -0.0017304435503242557, + "compression/movement_sparsity/linear_layer_sparsity": 0.5360750071163433, + "compression/movement_sparsity/model_sparsity": 0.5176591814622791, + "compression_loss": 102.06546783447266, + "distillation_loss": 2.6222758293151855, + "epoch": 1.28, + "learning_rate": 4.3575655114116657e-05, + "loss": 105.0114, + "step": 1520, + "task_loss": 2.132873773574829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9508558228919954, + "compression/movement_sparsity/importance_threshold": -0.0017243289144690923, + "compression/movement_sparsity/linear_layer_sparsity": 0.5379344141208092, + "compression/movement_sparsity/model_sparsity": 0.5194547121159365, + "compression_loss": 102.27413940429688, + "distillation_loss": 3.047675848007202, + "epoch": 1.29, + "learning_rate": 4.3571428571428576e-05, + "loss": 104.9919, + "step": 1521, + "task_loss": 2.442349433898926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9527985481293989, + "compression/movement_sparsity/importance_threshold": -0.0017182286999182154, + "compression/movement_sparsity/linear_layer_sparsity": 0.5396139569806557, + "compression/movement_sparsity/model_sparsity": 0.5210765575116693, + "compression_loss": 102.4822998046875, + "distillation_loss": 2.3306033611297607, + "epoch": 1.29, + "learning_rate": 4.3567202028740496e-05, + "loss": 105.1335, + "step": 1522, + "task_loss": 1.3816114664077759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9547366860602291, + "compression/movement_sparsity/importance_threshold": -0.0017121428896452922, + "compression/movement_sparsity/linear_layer_sparsity": 0.5412530769122168, + "compression/movement_sparsity/model_sparsity": 0.5226593686310583, + "compression_loss": 102.68992614746094, + "distillation_loss": 2.8695499897003174, + "epoch": 1.29, + "learning_rate": 4.356297548605241e-05, + "loss": 105.2621, + "step": 1523, + "task_loss": 1.7741518020629883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9566702421068347, + "compression/movement_sparsity/importance_threshold": -0.0017060714666239881, + "compression/movement_sparsity/linear_layer_sparsity": 0.5430503351549646, + "compression/movement_sparsity/model_sparsity": 0.5243948855241547, + "compression_loss": 102.89703369140625, + "distillation_loss": 2.940119504928589, + "epoch": 1.29, + "learning_rate": 4.355874894336433e-05, + "loss": 105.4771, + "step": 1524, + "task_loss": 1.316749930381775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9585992216915636, + "compression/movement_sparsity/importance_threshold": -0.0017000144138279707, + "compression/movement_sparsity/linear_layer_sparsity": 0.5446679558122781, + "compression/movement_sparsity/model_sparsity": 0.5259569359355063, + "compression_loss": 103.10370635986328, + "distillation_loss": 2.3594701290130615, + "epoch": 1.29, + "learning_rate": 4.355452240067625e-05, + "loss": 105.4503, + "step": 1525, + "task_loss": 1.0633206367492676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9605236302367641, + "compression/movement_sparsity/importance_threshold": -0.001693971714230907, + "compression/movement_sparsity/linear_layer_sparsity": 0.5464578926160976, + "compression/movement_sparsity/model_sparsity": 0.5276853829036249, + "compression_loss": 103.30982971191406, + "distillation_loss": 2.6086926460266113, + "epoch": 1.29, + "learning_rate": 4.355029585798817e-05, + "loss": 105.8107, + "step": 1526, + "task_loss": 1.6839449405670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9624434731647843, + "compression/movement_sparsity/importance_threshold": -0.0016879433508064628, + "compression/movement_sparsity/linear_layer_sparsity": 0.5484329640466313, + "compression/movement_sparsity/model_sparsity": 0.5295926045544894, + "compression_loss": 103.51546478271484, + "distillation_loss": 3.4850447177886963, + "epoch": 1.29, + "learning_rate": 4.354606931530009e-05, + "loss": 106.2459, + "step": 1527, + "task_loss": 1.6158456802368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9643587558979725, + "compression/movement_sparsity/importance_threshold": -0.0016819293065283053, + "compression/movement_sparsity/linear_layer_sparsity": 0.5500547343142821, + "compression/movement_sparsity/model_sparsity": 0.5311586620242975, + "compression_loss": 103.72052764892578, + "distillation_loss": 3.682271718978882, + "epoch": 1.29, + "learning_rate": 4.354184277261201e-05, + "loss": 107.3768, + "step": 1528, + "task_loss": 1.9275528192520142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9662694838586766, + "compression/movement_sparsity/importance_threshold": -0.0016759295643701023, + "compression/movement_sparsity/linear_layer_sparsity": 0.5517485861752918, + "compression/movement_sparsity/model_sparsity": 0.5327943248629836, + "compression_loss": 103.92520141601562, + "distillation_loss": 2.254791259765625, + "epoch": 1.29, + "learning_rate": 4.353761622992392e-05, + "loss": 106.4485, + "step": 1529, + "task_loss": 2.2280333042144775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9681756624692455, + "compression/movement_sparsity/importance_threshold": -0.0016699441073055182, + "compression/movement_sparsity/linear_layer_sparsity": 0.5536855837385642, + "compression/movement_sparsity/model_sparsity": 0.5346647806010563, + "compression_loss": 104.12935638427734, + "distillation_loss": 3.466214656829834, + "epoch": 1.29, + "learning_rate": 4.353338968723584e-05, + "loss": 106.8591, + "step": 1530, + "task_loss": 1.6891735792160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9700772971520265, + "compression/movement_sparsity/importance_threshold": -0.001663972918308222, + "compression/movement_sparsity/linear_layer_sparsity": 0.5556070918080778, + "compression/movement_sparsity/model_sparsity": 0.5365202789571317, + "compression_loss": 104.33299255371094, + "distillation_loss": 2.9059624671936035, + "epoch": 1.29, + "learning_rate": 4.352916314454776e-05, + "loss": 107.7146, + "step": 1531, + "task_loss": 1.1788159608840942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9719743933293683, + "compression/movement_sparsity/importance_threshold": -0.0016580159803518792, + "compression/movement_sparsity/linear_layer_sparsity": 0.557376697906078, + "compression/movement_sparsity/model_sparsity": 0.5382290936417206, + "compression_loss": 104.53616333007812, + "distillation_loss": 3.8276565074920654, + "epoch": 1.29, + "learning_rate": 4.352493660185968e-05, + "loss": 107.5411, + "step": 1532, + "task_loss": 2.6111137866973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9738669564236192, + "compression/movement_sparsity/importance_threshold": -0.0016520732764101565, + "compression/movement_sparsity/linear_layer_sparsity": 0.5592526675793902, + "compression/movement_sparsity/model_sparsity": 0.5400406179855967, + "compression_loss": 104.73873138427734, + "distillation_loss": 2.8210947513580322, + "epoch": 1.3, + "learning_rate": 4.35207100591716e-05, + "loss": 107.4569, + "step": 1533, + "task_loss": 1.7577792406082153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9757549918571271, + "compression/movement_sparsity/importance_threshold": -0.001646144789456721, + "compression/movement_sparsity/linear_layer_sparsity": 0.5609809922090351, + "compression/movement_sparsity/model_sparsity": 0.5417095693472648, + "compression_loss": 104.9408950805664, + "distillation_loss": 3.0811879634857178, + "epoch": 1.3, + "learning_rate": 4.351648351648352e-05, + "loss": 107.7517, + "step": 1534, + "task_loss": 2.302928924560547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9776385050522403, + "compression/movement_sparsity/importance_threshold": -0.0016402305024652395, + "compression/movement_sparsity/linear_layer_sparsity": 0.5628178625366692, + "compression/movement_sparsity/model_sparsity": 0.5434833375282706, + "compression_loss": 105.142578125, + "distillation_loss": 2.310521125793457, + "epoch": 1.3, + "learning_rate": 4.351225697379544e-05, + "loss": 107.4567, + "step": 1535, + "task_loss": 0.9698376655578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9795175014313072, + "compression/movement_sparsity/importance_threshold": -0.0016343303984093773, + "compression/movement_sparsity/linear_layer_sparsity": 0.56440013996111, + "compression/movement_sparsity/model_sparsity": 0.5450112588555273, + "compression_loss": 105.34381866455078, + "distillation_loss": 2.731828212738037, + "epoch": 1.3, + "learning_rate": 4.350803043110735e-05, + "loss": 108.147, + "step": 1536, + "task_loss": 1.5721303224563599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9813919864166756, + "compression/movement_sparsity/importance_threshold": -0.0016284444602628037, + "compression/movement_sparsity/linear_layer_sparsity": 0.5660561683623787, + "compression/movement_sparsity/model_sparsity": 0.5466103975866731, + "compression_loss": 105.54450225830078, + "distillation_loss": 3.9375834465026855, + "epoch": 1.3, + "learning_rate": 4.350380388841928e-05, + "loss": 109.001, + "step": 1537, + "task_loss": 2.6489920616149902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9832619654306939, + "compression/movement_sparsity/importance_threshold": -0.0016225726709991838, + "compression/movement_sparsity/linear_layer_sparsity": 0.567661256719507, + "compression/movement_sparsity/model_sparsity": 0.5481603462209046, + "compression_loss": 105.74466705322266, + "distillation_loss": 5.179448127746582, + "epoch": 1.3, + "learning_rate": 4.34995773457312e-05, + "loss": 109.7356, + "step": 1538, + "task_loss": 2.8491363525390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9851274438957104, + "compression/movement_sparsity/importance_threshold": -0.0016167150135921846, + "compression/movement_sparsity/linear_layer_sparsity": 0.5691827566615077, + "compression/movement_sparsity/model_sparsity": 0.5496295779592164, + "compression_loss": 105.94447326660156, + "distillation_loss": 2.8202476501464844, + "epoch": 1.3, + "learning_rate": 4.349535080304311e-05, + "loss": 108.5431, + "step": 1539, + "task_loss": 1.0403130054473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.986988427234073, + "compression/movement_sparsity/importance_threshold": -0.001610871471015473, + "compression/movement_sparsity/linear_layer_sparsity": 0.5710557691412462, + "compression/movement_sparsity/model_sparsity": 0.5514382466982155, + "compression_loss": 106.14372253417969, + "distillation_loss": 3.666755199432373, + "epoch": 1.3, + "learning_rate": 4.349112426035503e-05, + "loss": 109.5219, + "step": 1540, + "task_loss": 2.089750051498413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9888449208681303, + "compression/movement_sparsity/importance_threshold": -0.0016050420262427147, + "compression/movement_sparsity/linear_layer_sparsity": 0.5728565211651113, + "compression/movement_sparsity/model_sparsity": 0.5531771373502997, + "compression_loss": 106.34252166748047, + "distillation_loss": 4.183623790740967, + "epoch": 1.3, + "learning_rate": 4.348689771766695e-05, + "loss": 109.7301, + "step": 1541, + "task_loss": 2.2211577892303467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9906969302202301, + "compression/movement_sparsity/importance_threshold": -0.001599226662247578, + "compression/movement_sparsity/linear_layer_sparsity": 0.5744524159889923, + "compression/movement_sparsity/model_sparsity": 0.5547182082774336, + "compression_loss": 106.54082489013672, + "distillation_loss": 3.5080056190490723, + "epoch": 1.3, + "learning_rate": 4.348267117497887e-05, + "loss": 109.8182, + "step": 1542, + "task_loss": 2.497711658477783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9925444607127207, + "compression/movement_sparsity/importance_threshold": -0.001593425362003729, + "compression/movement_sparsity/linear_layer_sparsity": 0.5760025816299899, + "compression/movement_sparsity/model_sparsity": 0.5562151209597955, + "compression_loss": 106.7386703491211, + "distillation_loss": 6.2112836837768555, + "epoch": 1.3, + "learning_rate": 4.347844463229079e-05, + "loss": 110.4295, + "step": 1543, + "task_loss": 2.7345328330993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9943875177679506, + "compression/movement_sparsity/importance_threshold": -0.0015876381084848337, + "compression/movement_sparsity/linear_layer_sparsity": 0.5776402825856022, + "compression/movement_sparsity/model_sparsity": 0.5577965618494252, + "compression_loss": 106.93601989746094, + "distillation_loss": 4.030116081237793, + "epoch": 1.3, + "learning_rate": 4.347421808960271e-05, + "loss": 110.2622, + "step": 1544, + "task_loss": 2.91162109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962261068082677, + "compression/movement_sparsity/importance_threshold": -0.0015818648846645596, + "compression/movement_sparsity/linear_layer_sparsity": 0.5792424972183302, + "compression/movement_sparsity/model_sparsity": 0.5593437354805302, + "compression_loss": 107.13289642333984, + "distillation_loss": 3.9510879516601562, + "epoch": 1.31, + "learning_rate": 4.346999154691462e-05, + "loss": 110.1842, + "step": 1545, + "task_loss": 1.7879835367202759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99806023325602, + "compression/movement_sparsity/importance_threshold": -0.001576105673516574, + "compression/movement_sparsity/linear_layer_sparsity": 0.5809453995225754, + "compression/movement_sparsity/model_sparsity": 0.5609881378518844, + "compression_loss": 107.32929229736328, + "distillation_loss": 3.6610336303710938, + "epoch": 1.31, + "learning_rate": 4.346576500422654e-05, + "loss": 110.3642, + "step": 1546, + "task_loss": 1.9599016904830933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998899025335564, + "compression/movement_sparsity/importance_threshold": -0.001570360458014542, + "compression/movement_sparsity/linear_layer_sparsity": 0.5826692048926863, + "compression/movement_sparsity/model_sparsity": 0.5626527252044864, + "compression_loss": 107.52520751953125, + "distillation_loss": 5.01436710357666, + "epoch": 1.31, + "learning_rate": 4.346153846153846e-05, + "loss": 110.9813, + "step": 1547, + "task_loss": 2.7271504402160645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0017151200632244, + "compression/movement_sparsity/importance_threshold": -0.0015646292211321314, + "compression/movement_sparsity/linear_layer_sparsity": 0.5841610136572739, + "compression/movement_sparsity/model_sparsity": 0.5640932857486698, + "compression_loss": 107.72068786621094, + "distillation_loss": 3.9223690032958984, + "epoch": 1.31, + "learning_rate": 4.345731191885038e-05, + "loss": 110.7925, + "step": 1548, + "task_loss": 2.685188055038452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0035358912673722, + "compression/movement_sparsity/importance_threshold": -0.0015589119458430094, + "compression/movement_sparsity/linear_layer_sparsity": 0.585885856429969, + "compression/movement_sparsity/model_sparsity": 0.5657588748658858, + "compression_loss": 107.91567993164062, + "distillation_loss": 2.9874980449676514, + "epoch": 1.31, + "learning_rate": 4.34530853761623e-05, + "loss": 110.6391, + "step": 1549, + "task_loss": 1.3144748210906982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0053522215683484, + "compression/movement_sparsity/importance_threshold": -0.0015532086151208413, + "compression/movement_sparsity/linear_layer_sparsity": 0.587547477265859, + "compression/movement_sparsity/model_sparsity": 0.5673634139143193, + "compression_loss": 108.11019134521484, + "distillation_loss": 3.407069206237793, + "epoch": 1.31, + "learning_rate": 4.344885883347422e-05, + "loss": 111.0971, + "step": 1550, + "task_loss": 2.027880907058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0071641163885015, + "compression/movement_sparsity/importance_threshold": -0.0015475192119392936, + "compression/movement_sparsity/linear_layer_sparsity": 0.5890288404595975, + "compression/movement_sparsity/model_sparsity": 0.5687938877251467, + "compression_loss": 108.30413055419922, + "distillation_loss": 4.466436386108398, + "epoch": 1.31, + "learning_rate": 4.344463229078614e-05, + "loss": 111.9754, + "step": 1551, + "task_loss": 2.2840769290924072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0089715811501785, + "compression/movement_sparsity/importance_threshold": -0.0015418437192720358, + "compression/movement_sparsity/linear_layer_sparsity": 0.5905933747225962, + "compression/movement_sparsity/model_sparsity": 0.570304675423141, + "compression_loss": 108.49762725830078, + "distillation_loss": 4.507117748260498, + "epoch": 1.31, + "learning_rate": 4.3440405748098054e-05, + "loss": 111.8826, + "step": 1552, + "task_loss": 1.8480420112609863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.010774621275729, + "compression/movement_sparsity/importance_threshold": -0.0015361821200927314, + "compression/movement_sparsity/linear_layer_sparsity": 0.5921335002144442, + "compression/movement_sparsity/model_sparsity": 0.5717918928663639, + "compression_loss": 108.69068145751953, + "distillation_loss": 3.8605480194091797, + "epoch": 1.31, + "learning_rate": 4.3436179205409974e-05, + "loss": 112.4289, + "step": 1553, + "task_loss": 2.2457542419433594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0125732421875, + "compression/movement_sparsity/importance_threshold": -0.0015305343973750496, + "compression/movement_sparsity/linear_layer_sparsity": 0.59368987834678, + "compression/movement_sparsity/model_sparsity": 0.5732948046218748, + "compression_loss": 108.88323974609375, + "distillation_loss": 3.5598528385162354, + "epoch": 1.31, + "learning_rate": 4.34319526627219e-05, + "loss": 112.2562, + "step": 1554, + "task_loss": 2.8344852924346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0143674493078403, + "compression/movement_sparsity/importance_threshold": -0.0015249005340926564, + "compression/movement_sparsity/linear_layer_sparsity": 0.595239865125263, + "compression/movement_sparsity/model_sparsity": 0.5747915445861997, + "compression_loss": 109.0752944946289, + "distillation_loss": 3.137166976928711, + "epoch": 1.31, + "learning_rate": 4.3427726120033814e-05, + "loss": 111.8134, + "step": 1555, + "task_loss": 1.3303834199905396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0161572480590984, + "compression/movement_sparsity/importance_threshold": -0.0015192805132192163, + "compression/movement_sparsity/linear_layer_sparsity": 0.5969486937408232, + "compression/movement_sparsity/model_sparsity": 0.5764416696818438, + "compression_loss": 109.26691436767578, + "distillation_loss": 3.54176664352417, + "epoch": 1.32, + "learning_rate": 4.342349957734573e-05, + "loss": 112.3213, + "step": 1556, + "task_loss": 2.223477363586426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0179426438636214, + "compression/movement_sparsity/importance_threshold": -0.0015136743177283996, + "compression/movement_sparsity/linear_layer_sparsity": 0.5986538139402486, + "compression/movement_sparsity/model_sparsity": 0.5780882137568558, + "compression_loss": 109.45797729492188, + "distillation_loss": 3.852586507797241, + "epoch": 1.32, + "learning_rate": 4.341927303465765e-05, + "loss": 112.5126, + "step": 1557, + "task_loss": 2.234609842300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0197236421437588, + "compression/movement_sparsity/importance_threshold": -0.0015080819305938703, + "compression/movement_sparsity/linear_layer_sparsity": 0.6002549077012189, + "compression/movement_sparsity/model_sparsity": 0.5796343050215962, + "compression_loss": 109.64867401123047, + "distillation_loss": 3.726283073425293, + "epoch": 1.32, + "learning_rate": 4.3415046491969566e-05, + "loss": 112.9503, + "step": 1558, + "task_loss": 1.2220486402511597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0215002483218583, + "compression/movement_sparsity/importance_threshold": -0.0015025033347892958, + "compression/movement_sparsity/linear_layer_sparsity": 0.6018999897165979, + "compression/movement_sparsity/model_sparsity": 0.5812228734088826, + "compression_loss": 109.83879089355469, + "distillation_loss": 3.8369719982147217, + "epoch": 1.32, + "learning_rate": 4.341081994928149e-05, + "loss": 113.2569, + "step": 1559, + "task_loss": 3.256497859954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0232724678202672, + "compression/movement_sparsity/importance_threshold": -0.0014969385132883452, + "compression/movement_sparsity/linear_layer_sparsity": 0.6035786024913689, + "compression/movement_sparsity/model_sparsity": 0.5828438206708233, + "compression_loss": 110.02848052978516, + "distillation_loss": 4.360718727111816, + "epoch": 1.32, + "learning_rate": 4.340659340659341e-05, + "loss": 114.0015, + "step": 1560, + "task_loss": 3.466217279434204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0250403060613351, + "compression/movement_sparsity/importance_threshold": -0.001491387449064681, + "compression/movement_sparsity/linear_layer_sparsity": 0.6052541388308897, + "compression/movement_sparsity/model_sparsity": 0.584461797182529, + "compression_loss": 110.21770477294922, + "distillation_loss": 4.478077411651611, + "epoch": 1.32, + "learning_rate": 4.3402366863905325e-05, + "loss": 114.0476, + "step": 1561, + "task_loss": 2.728201389312744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0268037684674094, + "compression/movement_sparsity/importance_threshold": -0.0014858501250919727, + "compression/movement_sparsity/linear_layer_sparsity": 0.60700488089569, + "compression/movement_sparsity/model_sparsity": 0.5861523958714908, + "compression_loss": 110.40650177001953, + "distillation_loss": 2.6250550746917725, + "epoch": 1.32, + "learning_rate": 4.3398140321217245e-05, + "loss": 113.4276, + "step": 1562, + "task_loss": 1.7377750873565674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0285628604608383, + "compression/movement_sparsity/importance_threshold": -0.0014803265243438874, + "compression/movement_sparsity/linear_layer_sparsity": 0.6086484008451087, + "compression/movement_sparsity/model_sparsity": 0.5877394558545882, + "compression_loss": 110.5948486328125, + "distillation_loss": 3.2099967002868652, + "epoch": 1.32, + "learning_rate": 4.3393913778529165e-05, + "loss": 114.2914, + "step": 1563, + "task_loss": 1.6697765588760376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0303175874639705, + "compression/movement_sparsity/importance_threshold": -0.0014748166297940896, + "compression/movement_sparsity/linear_layer_sparsity": 0.6102528214488493, + "compression/movement_sparsity/model_sparsity": 0.5892887596748152, + "compression_loss": 110.78262329101562, + "distillation_loss": 2.9184906482696533, + "epoch": 1.32, + "learning_rate": 4.3389687235841084e-05, + "loss": 113.6251, + "step": 1564, + "task_loss": 1.1743499040603638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0320679548991538, + "compression/movement_sparsity/importance_threshold": -0.0014693204244162475, + "compression/movement_sparsity/linear_layer_sparsity": 0.611732968377489, + "compression/movement_sparsity/model_sparsity": 0.5907180590029916, + "compression_loss": 110.97003936767578, + "distillation_loss": 3.2068748474121094, + "epoch": 1.32, + "learning_rate": 4.3385460693153004e-05, + "loss": 113.8487, + "step": 1565, + "task_loss": 1.4513297080993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0338139681887362, + "compression/movement_sparsity/importance_threshold": -0.0014638378911840289, + "compression/movement_sparsity/linear_layer_sparsity": 0.6131744810029884, + "compression/movement_sparsity/model_sparsity": 0.5921100512351937, + "compression_loss": 111.15686798095703, + "distillation_loss": 4.41751766204834, + "epoch": 1.32, + "learning_rate": 4.3381234150464924e-05, + "loss": 114.8833, + "step": 1566, + "task_loss": 2.231030225753784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0355556327550666, + "compression/movement_sparsity/importance_threshold": -0.0014583690130710973, + "compression/movement_sparsity/linear_layer_sparsity": 0.6147882143816528, + "compression/movement_sparsity/model_sparsity": 0.5936683479078763, + "compression_loss": 111.34326934814453, + "distillation_loss": 6.527504920959473, + "epoch": 1.32, + "learning_rate": 4.3377007607776844e-05, + "loss": 114.8586, + "step": 1567, + "task_loss": 3.6513991355895996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0372929540204925, + "compression/movement_sparsity/importance_threshold": -0.001452913773051122, + "compression/movement_sparsity/linear_layer_sparsity": 0.616481553503454, + "compression/movement_sparsity/model_sparsity": 0.5953035156215232, + "compression_loss": 111.52920532226562, + "distillation_loss": 4.529018402099609, + "epoch": 1.33, + "learning_rate": 4.3372781065088757e-05, + "loss": 114.9943, + "step": 1568, + "task_loss": 2.6245908737182617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.039025937407362, + "compression/movement_sparsity/importance_threshold": -0.0014474721540977701, + "compression/movement_sparsity/linear_layer_sparsity": 0.6181607028657686, + "compression/movement_sparsity/model_sparsity": 0.5969249810375746, + "compression_loss": 111.71471405029297, + "distillation_loss": 3.5183393955230713, + "epoch": 1.33, + "learning_rate": 4.3368554522400676e-05, + "loss": 115.4276, + "step": 1569, + "task_loss": 1.5850911140441895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0407545883380243, + "compression/movement_sparsity/importance_threshold": -0.0014420441391847063, + "compression/movement_sparsity/linear_layer_sparsity": 0.6196274589541532, + "compression/movement_sparsity/model_sparsity": 0.5983413495420536, + "compression_loss": 111.8996810913086, + "distillation_loss": 3.3578598499298096, + "epoch": 1.33, + "learning_rate": 4.3364327979712596e-05, + "loss": 115.9626, + "step": 1570, + "task_loss": 1.2393653392791748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0424789122348264, + "compression/movement_sparsity/importance_threshold": -0.0014366297112855986, + "compression/movement_sparsity/linear_layer_sparsity": 0.6212466774499299, + "compression/movement_sparsity/model_sparsity": 0.5999049429012018, + "compression_loss": 112.084228515625, + "distillation_loss": 4.579760551452637, + "epoch": 1.33, + "learning_rate": 4.3360101437024516e-05, + "loss": 115.9855, + "step": 1571, + "task_loss": 2.1139378547668457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0441989145201171, + "compression/movement_sparsity/importance_threshold": -0.0014312288533741132, + "compression/movement_sparsity/linear_layer_sparsity": 0.6227893666378197, + "compression/movement_sparsity/model_sparsity": 0.6013946359696205, + "compression_loss": 112.2683334350586, + "distillation_loss": 3.9380552768707275, + "epoch": 1.33, + "learning_rate": 4.3355874894336436e-05, + "loss": 115.8131, + "step": 1572, + "task_loss": 1.858887791633606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.045914600616245, + "compression/movement_sparsity/importance_threshold": -0.0014258415484239163, + "compression/movement_sparsity/linear_layer_sparsity": 0.6243043202117885, + "compression/movement_sparsity/model_sparsity": 0.6028575462277811, + "compression_loss": 112.4519271850586, + "distillation_loss": 3.7959330081939697, + "epoch": 1.33, + "learning_rate": 4.3351648351648355e-05, + "loss": 116.1372, + "step": 1573, + "task_loss": 2.0476863384246826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0476259759455573, + "compression/movement_sparsity/importance_threshold": -0.0014204677794086768, + "compression/movement_sparsity/linear_layer_sparsity": 0.6256862716199468, + "compression/movement_sparsity/model_sparsity": 0.6041920233536894, + "compression_loss": 112.63510131835938, + "distillation_loss": 2.433563470840454, + "epoch": 1.33, + "learning_rate": 4.334742180896027e-05, + "loss": 114.9126, + "step": 1574, + "task_loss": 1.2678323984146118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0493330459304029, + "compression/movement_sparsity/importance_threshold": -0.0014151075293020589, + "compression/movement_sparsity/linear_layer_sparsity": 0.6272780168334905, + "compression/movement_sparsity/model_sparsity": 0.6057290872223668, + "compression_loss": 112.81768035888672, + "distillation_loss": 3.221038579940796, + "epoch": 1.33, + "learning_rate": 4.334319526627219e-05, + "loss": 116.8326, + "step": 1575, + "task_loss": 1.185097098350525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0510358159931297, + "compression/movement_sparsity/importance_threshold": -0.001409760781077731, + "compression/movement_sparsity/linear_layer_sparsity": 0.6285827354078714, + "compression/movement_sparsity/model_sparsity": 0.6069889846999339, + "compression_loss": 112.9998779296875, + "distillation_loss": 5.349943161010742, + "epoch": 1.33, + "learning_rate": 4.3338968723584115e-05, + "loss": 116.8133, + "step": 1576, + "task_loss": 2.0130677223205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.052734291556086, + "compression/movement_sparsity/importance_threshold": -0.0014044275177093595, + "compression/movement_sparsity/linear_layer_sparsity": 0.6301701402243957, + "compression/movement_sparsity/model_sparsity": 0.6085218572775821, + "compression_loss": 113.18162536621094, + "distillation_loss": 3.5576772689819336, + "epoch": 1.33, + "learning_rate": 4.3334742180896034e-05, + "loss": 117.106, + "step": 1577, + "task_loss": 1.9461464881896973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0544284780416202, + "compression/movement_sparsity/importance_threshold": -0.00139910772217061, + "compression/movement_sparsity/linear_layer_sparsity": 0.6317991246134662, + "compression/movement_sparsity/model_sparsity": 0.610094881041546, + "compression_loss": 113.36290740966797, + "distillation_loss": 3.3254685401916504, + "epoch": 1.33, + "learning_rate": 4.333051563820795e-05, + "loss": 117.0264, + "step": 1578, + "task_loss": 1.3175159692764282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0561183808720802, + "compression/movement_sparsity/importance_threshold": -0.00139380137743515, + "compression/movement_sparsity/linear_layer_sparsity": 0.6332216062674189, + "compression/movement_sparsity/model_sparsity": 0.61146849607462, + "compression_loss": 113.54379272460938, + "distillation_loss": 4.478581428527832, + "epoch": 1.33, + "learning_rate": 4.332628909551987e-05, + "loss": 117.2172, + "step": 1579, + "task_loss": 2.7694849967956543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0578040054698143, + "compression/movement_sparsity/importance_threshold": -0.0013885084664766473, + "compression/movement_sparsity/linear_layer_sparsity": 0.6346106406311529, + "compression/movement_sparsity/model_sparsity": 0.6128098128347902, + "compression_loss": 113.72419738769531, + "distillation_loss": 4.755603790283203, + "epoch": 1.34, + "learning_rate": 4.332206255283179e-05, + "loss": 118.0979, + "step": 1580, + "task_loss": 2.0192434787750244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0594853572571705, + "compression/movement_sparsity/importance_threshold": -0.001383228972268767, + "compression/movement_sparsity/linear_layer_sparsity": 0.6360831918650085, + "compression/movement_sparsity/model_sparsity": 0.6142317774036654, + "compression_loss": 113.90418243408203, + "distillation_loss": 3.7032461166381836, + "epoch": 1.34, + "learning_rate": 4.3317836010143706e-05, + "loss": 117.6311, + "step": 1581, + "task_loss": 2.0278055667877197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0611624416564975, + "compression/movement_sparsity/importance_threshold": -0.001377962877785177, + "compression/movement_sparsity/linear_layer_sparsity": 0.6374225262980363, + "compression/movement_sparsity/model_sparsity": 0.615525101578644, + "compression_loss": 114.08367156982422, + "distillation_loss": 4.694839954376221, + "epoch": 1.34, + "learning_rate": 4.3313609467455626e-05, + "loss": 118.2582, + "step": 1582, + "task_loss": 2.7239973545074463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0628352640901428, + "compression/movement_sparsity/importance_threshold": -0.0013727101659995432, + "compression/movement_sparsity/linear_layer_sparsity": 0.63889513715273, + "compression/movement_sparsity/model_sparsity": 0.6169471237201982, + "compression_loss": 114.26268768310547, + "distillation_loss": 3.1247453689575195, + "epoch": 1.34, + "learning_rate": 4.3309382924767546e-05, + "loss": 118.0914, + "step": 1583, + "task_loss": 1.7471249103546143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0645038299804552, + "compression/movement_sparsity/importance_threshold": -0.0013674708198855327, + "compression/movement_sparsity/linear_layer_sparsity": 0.6405620880915532, + "compression/movement_sparsity/model_sparsity": 0.6185568097661318, + "compression_loss": 114.44129943847656, + "distillation_loss": 3.806407928466797, + "epoch": 1.34, + "learning_rate": 4.330515638207946e-05, + "loss": 118.4568, + "step": 1584, + "task_loss": 2.9354496002197266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0661681447497828, + "compression/movement_sparsity/importance_threshold": -0.0013622448224168107, + "compression/movement_sparsity/linear_layer_sparsity": 0.6419462096982212, + "compression/movement_sparsity/model_sparsity": 0.6198933825375548, + "compression_loss": 114.61946105957031, + "distillation_loss": 3.8559744358062744, + "epoch": 1.34, + "learning_rate": 4.330092983939138e-05, + "loss": 118.1757, + "step": 1585, + "task_loss": 1.7772661447525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0678282138204733, + "compression/movement_sparsity/importance_threshold": -0.0013570321565670473, + "compression/movement_sparsity/linear_layer_sparsity": 0.6433519617449808, + "compression/movement_sparsity/model_sparsity": 0.621250842676909, + "compression_loss": 114.7971420288086, + "distillation_loss": 5.358077526092529, + "epoch": 1.34, + "learning_rate": 4.32967032967033e-05, + "loss": 118.7218, + "step": 1586, + "task_loss": 2.0680317878723145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0694840426148755, + "compression/movement_sparsity/importance_threshold": -0.0013518328053099059, + "compression/movement_sparsity/linear_layer_sparsity": 0.6448976081264443, + "compression/movement_sparsity/model_sparsity": 0.6227433913502047, + "compression_loss": 114.97431945800781, + "distillation_loss": 3.985830783843994, + "epoch": 1.34, + "learning_rate": 4.329247675401522e-05, + "loss": 118.1344, + "step": 1587, + "task_loss": 1.5103660821914673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0711356365553373, + "compression/movement_sparsity/importance_threshold": -0.0013466467516190551, + "compression/movement_sparsity/linear_layer_sparsity": 0.646425725981483, + "compression/movement_sparsity/model_sparsity": 0.6242190136558825, + "compression_loss": 115.15105438232422, + "distillation_loss": 3.744576930999756, + "epoch": 1.34, + "learning_rate": 4.328825021132714e-05, + "loss": 118.8127, + "step": 1588, + "task_loss": 1.76736319065094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0727830010642068, + "compression/movement_sparsity/importance_threshold": -0.0013414739784681617, + "compression/movement_sparsity/linear_layer_sparsity": 0.6479528779789433, + "compression/movement_sparsity/model_sparsity": 0.625693703284161, + "compression_loss": 115.32734680175781, + "distillation_loss": 4.767174243927002, + "epoch": 1.34, + "learning_rate": 4.328402366863906e-05, + "loss": 119.5958, + "step": 1589, + "task_loss": 1.9313116073608398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.074426141563833, + "compression/movement_sparsity/importance_threshold": -0.0013363144688308898, + "compression/movement_sparsity/linear_layer_sparsity": 0.6493796284849095, + "compression/movement_sparsity/model_sparsity": 0.6270714405210494, + "compression_loss": 115.50312042236328, + "distillation_loss": 4.791862964630127, + "epoch": 1.34, + "learning_rate": 4.327979712595097e-05, + "loss": 119.7996, + "step": 1590, + "task_loss": 3.1982789039611816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0760650634765625, + "compression/movement_sparsity/importance_threshold": -0.0013311682056809104, + "compression/movement_sparsity/linear_layer_sparsity": 0.6507805273954412, + "compression/movement_sparsity/model_sparsity": 0.6284242142443351, + "compression_loss": 115.67848205566406, + "distillation_loss": 3.7637436389923096, + "epoch": 1.34, + "learning_rate": 4.327557058326289e-05, + "loss": 119.6238, + "step": 1591, + "task_loss": 2.3045530319213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0776997722247448, + "compression/movement_sparsity/importance_threshold": -0.0013260351719918875, + "compression/movement_sparsity/linear_layer_sparsity": 0.6523456459426541, + "compression/movement_sparsity/model_sparsity": 0.6299355661545833, + "compression_loss": 115.85338592529297, + "distillation_loss": 2.95743465423584, + "epoch": 1.35, + "learning_rate": 4.327134404057481e-05, + "loss": 119.8796, + "step": 1592, + "task_loss": 2.1221401691436768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.079330273230728, + "compression/movement_sparsity/importance_threshold": -0.001320915350737487, + "compression/movement_sparsity/linear_layer_sparsity": 0.6538129266944147, + "compression/movement_sparsity/model_sparsity": 0.6313524412986374, + "compression_loss": 116.02781677246094, + "distillation_loss": 5.060418128967285, + "epoch": 1.35, + "learning_rate": 4.3267117497886737e-05, + "loss": 119.7491, + "step": 1593, + "task_loss": 2.6820931434631348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0809565719168597, + "compression/movement_sparsity/importance_threshold": -0.0013158087248913782, + "compression/movement_sparsity/linear_layer_sparsity": 0.6554549799712142, + "compression/movement_sparsity/model_sparsity": 0.6329380849938321, + "compression_loss": 116.20174407958984, + "distillation_loss": 2.4874887466430664, + "epoch": 1.35, + "learning_rate": 4.326289095519865e-05, + "loss": 120.4098, + "step": 1594, + "task_loss": 2.1427178382873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0825786737054883, + "compression/movement_sparsity/importance_threshold": -0.0013107152774272262, + "compression/movement_sparsity/linear_layer_sparsity": 0.6567134566235031, + "compression/movement_sparsity/model_sparsity": 0.634153329101588, + "compression_loss": 116.37520599365234, + "distillation_loss": 3.4833521842956543, + "epoch": 1.35, + "learning_rate": 4.325866441251057e-05, + "loss": 120.488, + "step": 1595, + "task_loss": 2.1483876705169678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0841965840189625, + "compression/movement_sparsity/importance_threshold": -0.0013056349913186977, + "compression/movement_sparsity/linear_layer_sparsity": 0.6581914810503036, + "compression/movement_sparsity/model_sparsity": 0.6355805788423928, + "compression_loss": 116.54830169677734, + "distillation_loss": 4.280088901519775, + "epoch": 1.35, + "learning_rate": 4.325443786982249e-05, + "loss": 120.4263, + "step": 1596, + "task_loss": 1.8133630752563477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0858103082796298, + "compression/movement_sparsity/importance_threshold": -0.0013005678495394603, + "compression/movement_sparsity/linear_layer_sparsity": 0.6596304657522643, + "compression/movement_sparsity/model_sparsity": 0.6369701299930064, + "compression_loss": 116.7208480834961, + "distillation_loss": 4.008568286895752, + "epoch": 1.35, + "learning_rate": 4.32502113271344e-05, + "loss": 120.6723, + "step": 1597, + "task_loss": 2.8913395404815674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0874198519098388, + "compression/movement_sparsity/importance_threshold": -0.00129551383506318, + "compression/movement_sparsity/linear_layer_sparsity": 0.6611544697694687, + "compression/movement_sparsity/model_sparsity": 0.6384417797838351, + "compression_loss": 116.89300537109375, + "distillation_loss": 4.252841472625732, + "epoch": 1.35, + "learning_rate": 4.324598478444633e-05, + "loss": 120.5307, + "step": 1598, + "task_loss": 2.6416594982147217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0890252203319377, + "compression/movement_sparsity/importance_threshold": -0.0012904729308635234, + "compression/movement_sparsity/linear_layer_sparsity": 0.6625170205568834, + "compression/movement_sparsity/model_sparsity": 0.6397575227600056, + "compression_loss": 117.06465911865234, + "distillation_loss": 3.74229097366333, + "epoch": 1.35, + "learning_rate": 4.324175824175825e-05, + "loss": 121.2333, + "step": 1599, + "task_loss": 2.898186683654785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0906264189682742, + "compression/movement_sparsity/importance_threshold": -0.0012854451199141585, + "compression/movement_sparsity/linear_layer_sparsity": 0.6637171522569302, + "compression/movement_sparsity/model_sparsity": 0.6409164262441188, + "compression_loss": 117.23584747314453, + "distillation_loss": 4.659516334533691, + "epoch": 1.35, + "learning_rate": 4.323753169907016e-05, + "loss": 120.671, + "step": 1600, + "task_loss": 2.0339272022247314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0922234532411974, + "compression/movement_sparsity/importance_threshold": -0.0012804303851887493, + "compression/movement_sparsity/linear_layer_sparsity": 0.6650139531840008, + "compression/movement_sparsity/model_sparsity": 0.6421686780699183, + "compression_loss": 117.40653991699219, + "distillation_loss": 4.822331428527832, + "epoch": 1.35, + "learning_rate": 4.323330515638208e-05, + "loss": 122.4939, + "step": 1601, + "task_loss": 2.14998197555542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0938163285730549, + "compression/movement_sparsity/importance_threshold": -0.0012754287096609653, + "compression/movement_sparsity/linear_layer_sparsity": 0.6665544840975485, + "compression/movement_sparsity/model_sparsity": 0.6436562870073582, + "compression_loss": 117.57678985595703, + "distillation_loss": 5.297080039978027, + "epoch": 1.35, + "learning_rate": 4.3229078613694e-05, + "loss": 121.7672, + "step": 1602, + "task_loss": 3.3388876914978027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0954050503861945, + "compression/movement_sparsity/importance_threshold": -0.001270440076304473, + "compression/movement_sparsity/linear_layer_sparsity": 0.6678812027612175, + "compression/movement_sparsity/model_sparsity": 0.6449374288034662, + "compression_loss": 117.74663543701172, + "distillation_loss": 4.490848064422607, + "epoch": 1.35, + "learning_rate": 4.3224852071005914e-05, + "loss": 121.8935, + "step": 1603, + "task_loss": 1.8725887537002563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.096989624102965, + "compression/movement_sparsity/importance_threshold": -0.0012654644680929377, + "compression/movement_sparsity/linear_layer_sparsity": 0.6692411183075848, + "compression/movement_sparsity/model_sparsity": 0.6462506270672262, + "compression_loss": 117.91603088378906, + "distillation_loss": 3.412388801574707, + "epoch": 1.36, + "learning_rate": 4.322062552831784e-05, + "loss": 122.3655, + "step": 1604, + "task_loss": 2.1094701290130615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0985700551457152, + "compression/movement_sparsity/importance_threshold": -0.0012605018680000258, + "compression/movement_sparsity/linear_layer_sparsity": 0.6707111177695663, + "compression/movement_sparsity/model_sparsity": 0.6476701275254414, + "compression_loss": 118.08499908447266, + "distillation_loss": 3.7808332443237305, + "epoch": 1.36, + "learning_rate": 4.321639898562976e-05, + "loss": 122.0265, + "step": 1605, + "task_loss": 2.498356819152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1001463489367915, + "compression/movement_sparsity/importance_threshold": -0.0012555522589994066, + "compression/movement_sparsity/linear_layer_sparsity": 0.6719697255876993, + "compression/movement_sparsity/model_sparsity": 0.648885498293091, + "compression_loss": 118.25349426269531, + "distillation_loss": 3.7680885791778564, + "epoch": 1.36, + "learning_rate": 4.321217244294168e-05, + "loss": 122.624, + "step": 1606, + "task_loss": 2.3853378295898438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.101718510898544, + "compression/movement_sparsity/importance_threshold": -0.001250615624064744, + "compression/movement_sparsity/linear_layer_sparsity": 0.6733138296877814, + "compression/movement_sparsity/model_sparsity": 0.6501834282823874, + "compression_loss": 118.42152404785156, + "distillation_loss": 5.073239803314209, + "epoch": 1.36, + "learning_rate": 4.320794590025359e-05, + "loss": 122.7151, + "step": 1607, + "task_loss": 2.7709081172943115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1032865464533197, + "compression/movement_sparsity/importance_threshold": -0.0012456919461697061, + "compression/movement_sparsity/linear_layer_sparsity": 0.6748297610434963, + "compression/movement_sparsity/model_sparsity": 0.6516472827324832, + "compression_loss": 118.58906555175781, + "distillation_loss": 5.1621527671813965, + "epoch": 1.36, + "learning_rate": 4.320371935756551e-05, + "loss": 122.771, + "step": 1608, + "task_loss": 2.8966009616851807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1048504610234668, + "compression/movement_sparsity/importance_threshold": -0.00124078120828796, + "compression/movement_sparsity/linear_layer_sparsity": 0.6759656334041534, + "compression/movement_sparsity/model_sparsity": 0.6527441343831996, + "compression_loss": 118.75621795654297, + "distillation_loss": 4.108824253082275, + "epoch": 1.36, + "learning_rate": 4.319949281487743e-05, + "loss": 123.11, + "step": 1609, + "task_loss": 2.221781015396118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1064102600313346, + "compression/movement_sparsity/importance_threshold": -0.001235883393393171, + "compression/movement_sparsity/linear_layer_sparsity": 0.6774813143523479, + "compression/movement_sparsity/model_sparsity": 0.6542077470280436, + "compression_loss": 118.92295837402344, + "distillation_loss": 5.164173603057861, + "epoch": 1.36, + "learning_rate": 4.319526627218935e-05, + "loss": 122.8529, + "step": 1610, + "task_loss": 2.004044532775879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1079659488992701, + "compression/movement_sparsity/importance_threshold": -0.0012309984844590073, + "compression/movement_sparsity/linear_layer_sparsity": 0.6790146430686436, + "compression/movement_sparsity/model_sparsity": 0.6556884011858636, + "compression_loss": 119.08917236328125, + "distillation_loss": 7.246700763702393, + "epoch": 1.36, + "learning_rate": 4.319103972950127e-05, + "loss": 123.8255, + "step": 1611, + "task_loss": 4.234955787658691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1095175330496219, + "compression/movement_sparsity/importance_threshold": -0.001226126464459135, + "compression/movement_sparsity/linear_layer_sparsity": 0.6804013641438562, + "compression/movement_sparsity/model_sparsity": 0.6570274841260897, + "compression_loss": 119.25492858886719, + "distillation_loss": 3.6851930618286133, + "epoch": 1.36, + "learning_rate": 4.318681318681319e-05, + "loss": 123.2515, + "step": 1612, + "task_loss": 3.4537675380706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1110650179047383, + "compression/movement_sparsity/importance_threshold": -0.0012212673163672205, + "compression/movement_sparsity/linear_layer_sparsity": 0.6816465572733988, + "compression/movement_sparsity/model_sparsity": 0.6582299010409705, + "compression_loss": 119.42024230957031, + "distillation_loss": 2.9697279930114746, + "epoch": 1.36, + "learning_rate": 4.3182586644125104e-05, + "loss": 123.0545, + "step": 1613, + "task_loss": 1.1787941455841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1126084088869674, + "compression/movement_sparsity/importance_threshold": -0.0012164210231569313, + "compression/movement_sparsity/linear_layer_sparsity": 0.682951788586988, + "compression/movement_sparsity/model_sparsity": 0.6594902936435767, + "compression_loss": 119.5850601196289, + "distillation_loss": 4.07765531539917, + "epoch": 1.36, + "learning_rate": 4.3178360101437024e-05, + "loss": 124.2977, + "step": 1614, + "task_loss": 2.54297137260437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1141477114186573, + "compression/movement_sparsity/importance_threshold": -0.0012115875678019332, + "compression/movement_sparsity/linear_layer_sparsity": 0.6842453342162941, + "compression/movement_sparsity/model_sparsity": 0.6607394020011043, + "compression_loss": 119.74951171875, + "distillation_loss": 4.686488151550293, + "epoch": 1.36, + "learning_rate": 4.317413355874895e-05, + "loss": 123.9731, + "step": 1615, + "task_loss": 2.6008150577545166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1156829309221568, + "compression/movement_sparsity/importance_threshold": -0.0012067669332758934, + "compression/movement_sparsity/linear_layer_sparsity": 0.6855348733252745, + "compression/movement_sparsity/model_sparsity": 0.6619846414746049, + "compression_loss": 119.91354370117188, + "distillation_loss": 2.927152156829834, + "epoch": 1.37, + "learning_rate": 4.3169907016060863e-05, + "loss": 124.3327, + "step": 1616, + "task_loss": 1.656904697418213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.117214072819813, + "compression/movement_sparsity/importance_threshold": -0.0012019591025524794, + "compression/movement_sparsity/linear_layer_sparsity": 0.6869271749109408, + "compression/movement_sparsity/model_sparsity": 0.6633291132175828, + "compression_loss": 120.07717895507812, + "distillation_loss": 4.406262397766113, + "epoch": 1.37, + "learning_rate": 4.316568047337278e-05, + "loss": 123.9187, + "step": 1617, + "task_loss": 2.220959186553955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.118741142533975, + "compression/movement_sparsity/importance_threshold": -0.001197164058605356, + "compression/movement_sparsity/linear_layer_sparsity": 0.6880727177715505, + "compression/movement_sparsity/model_sparsity": 0.6644353031568286, + "compression_loss": 120.24034118652344, + "distillation_loss": 4.4984517097473145, + "epoch": 1.37, + "learning_rate": 4.31614539306847e-05, + "loss": 124.8044, + "step": 1618, + "task_loss": 1.910883903503418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1202641454869908, + "compression/movement_sparsity/importance_threshold": -0.0011923817844081913, + "compression/movement_sparsity/linear_layer_sparsity": 0.689435280483133, + "compression/movement_sparsity/model_sparsity": 0.665751057647535, + "compression_loss": 120.4030532836914, + "distillation_loss": 5.075961112976074, + "epoch": 1.37, + "learning_rate": 4.3157227387996616e-05, + "loss": 124.3477, + "step": 1619, + "task_loss": 2.0177574157714844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1217830871012082, + "compression/movement_sparsity/importance_threshold": -0.0011876122629346526, + "compression/movement_sparsity/linear_layer_sparsity": 0.6906907283968424, + "compression/movement_sparsity/model_sparsity": 0.666963377063199, + "compression_loss": 120.5652847290039, + "distillation_loss": 3.847567081451416, + "epoch": 1.37, + "learning_rate": 4.3153000845308536e-05, + "loss": 125.3068, + "step": 1620, + "task_loss": 0.9891605377197266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1232979727989758, + "compression/movement_sparsity/importance_threshold": -0.0011828554771584053, + "compression/movement_sparsity/linear_layer_sparsity": 0.6920094101715248, + "compression/movement_sparsity/model_sparsity": 0.6682367580621814, + "compression_loss": 120.72709655761719, + "distillation_loss": 4.334209442138672, + "epoch": 1.37, + "learning_rate": 4.314877430262046e-05, + "loss": 125.268, + "step": 1621, + "task_loss": 3.0601370334625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1248088080026417, + "compression/movement_sparsity/importance_threshold": -0.0011781114100531164, + "compression/movement_sparsity/linear_layer_sparsity": 0.6931947797520384, + "compression/movement_sparsity/model_sparsity": 0.669381406550981, + "compression_loss": 120.88846588134766, + "distillation_loss": 5.940729141235352, + "epoch": 1.37, + "learning_rate": 4.314454775993238e-05, + "loss": 125.5652, + "step": 1622, + "task_loss": 2.800605535507202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.126315598134554, + "compression/movement_sparsity/importance_threshold": -0.0011733800445924533, + "compression/movement_sparsity/linear_layer_sparsity": 0.694547481176986, + "compression/movement_sparsity/model_sparsity": 0.6706876385205852, + "compression_loss": 121.04940795898438, + "distillation_loss": 3.9292197227478027, + "epoch": 1.37, + "learning_rate": 4.3140321217244295e-05, + "loss": 125.5798, + "step": 1623, + "task_loss": 2.5937817096710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.127818348617061, + "compression/movement_sparsity/importance_threshold": -0.0011686613637500818, + "compression/movement_sparsity/linear_layer_sparsity": 0.6956885643988998, + "compression/movement_sparsity/model_sparsity": 0.6717895220234439, + "compression_loss": 121.20983123779297, + "distillation_loss": 5.505153656005859, + "epoch": 1.37, + "learning_rate": 4.3136094674556215e-05, + "loss": 126.286, + "step": 1624, + "task_loss": 3.20289945602417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.129317064872511, + "compression/movement_sparsity/importance_threshold": -0.0011639553504996692, + "compression/movement_sparsity/linear_layer_sparsity": 0.6968212649309657, + "compression/movement_sparsity/model_sparsity": 0.6728833108076389, + "compression_loss": 121.3698959350586, + "distillation_loss": 3.1672205924987793, + "epoch": 1.37, + "learning_rate": 4.3131868131868134e-05, + "loss": 125.4565, + "step": 1625, + "task_loss": 1.7627214193344116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.130811752323252, + "compression/movement_sparsity/importance_threshold": -0.0011592619878148828, + "compression/movement_sparsity/linear_layer_sparsity": 0.6981937962466918, + "compression/movement_sparsity/model_sparsity": 0.6742086914502695, + "compression_loss": 121.52953338623047, + "distillation_loss": 4.028879165649414, + "epoch": 1.37, + "learning_rate": 4.3127641589180054e-05, + "loss": 125.9492, + "step": 1626, + "task_loss": 1.833629846572876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1323024163916324, + "compression/movement_sparsity/importance_threshold": -0.0011545812586693871, + "compression/movement_sparsity/linear_layer_sparsity": 0.6994968692859389, + "compression/movement_sparsity/model_sparsity": 0.6754669999218968, + "compression_loss": 121.68870544433594, + "distillation_loss": 5.096287727355957, + "epoch": 1.38, + "learning_rate": 4.3123415046491974e-05, + "loss": 126.73, + "step": 1627, + "task_loss": 2.98466157913208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1337890625, + "compression/movement_sparsity/importance_threshold": -0.0011499131460368517, + "compression/movement_sparsity/linear_layer_sparsity": 0.7007197522978348, + "compression/movement_sparsity/model_sparsity": 0.676647873140306, + "compression_loss": 121.8475112915039, + "distillation_loss": 4.1848344802856445, + "epoch": 1.38, + "learning_rate": 4.3119188503803893e-05, + "loss": 125.5736, + "step": 1628, + "task_loss": 2.1423747539520264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1352716960707032, + "compression/movement_sparsity/importance_threshold": -0.0011452576328909421, + "compression/movement_sparsity/linear_layer_sparsity": 0.7016915361876547, + "compression/movement_sparsity/model_sparsity": 0.6775862732639537, + "compression_loss": 122.00588989257812, + "distillation_loss": 4.5834574699401855, + "epoch": 1.38, + "learning_rate": 4.3114961961115806e-05, + "loss": 126.795, + "step": 1629, + "task_loss": 2.2282190322875977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1367503225260904, + "compression/movement_sparsity/importance_threshold": -0.0011406147022053237, + "compression/movement_sparsity/linear_layer_sparsity": 0.7031004362146701, + "compression/movement_sparsity/model_sparsity": 0.6789467732407577, + "compression_loss": 122.16371154785156, + "distillation_loss": 5.430883407592773, + "epoch": 1.38, + "learning_rate": 4.3110735418427726e-05, + "loss": 126.6811, + "step": 1630, + "task_loss": 2.839416027069092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1382249472885098, + "compression/movement_sparsity/importance_threshold": -0.001135984336953665, + "compression/movement_sparsity/linear_layer_sparsity": 0.7045009893243405, + "compression/movement_sparsity/model_sparsity": 0.6802992130425055, + "compression_loss": 122.32119750976562, + "distillation_loss": 5.543839454650879, + "epoch": 1.38, + "learning_rate": 4.3106508875739646e-05, + "loss": 127.0583, + "step": 1631, + "task_loss": 3.2805869579315186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1396955757803091, + "compression/movement_sparsity/importance_threshold": -0.0011313665201096328, + "compression/movement_sparsity/linear_layer_sparsity": 0.7058499704089856, + "compression/movement_sparsity/model_sparsity": 0.6816018524769419, + "compression_loss": 122.47811889648438, + "distillation_loss": 4.163388729095459, + "epoch": 1.38, + "learning_rate": 4.3102282333051566e-05, + "loss": 126.7668, + "step": 1632, + "task_loss": 2.362947702407837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1411622134238373, + "compression/movement_sparsity/importance_threshold": -0.0011267612346468918, + "compression/movement_sparsity/linear_layer_sparsity": 0.7071779768827593, + "compression/movement_sparsity/model_sparsity": 0.6828842378429156, + "compression_loss": 122.63465881347656, + "distillation_loss": 5.2354655265808105, + "epoch": 1.38, + "learning_rate": 4.3098055790363485e-05, + "loss": 127.0408, + "step": 1633, + "task_loss": 2.386781930923462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1426248656414415, + "compression/movement_sparsity/importance_threshold": -0.0011221684635391118, + "compression/movement_sparsity/linear_layer_sparsity": 0.7083952555358664, + "compression/movement_sparsity/model_sparsity": 0.6840596992295014, + "compression_loss": 122.79078674316406, + "distillation_loss": 6.934826374053955, + "epoch": 1.38, + "learning_rate": 4.3093829247675405e-05, + "loss": 127.7497, + "step": 1634, + "task_loss": 3.19840931892395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1440835378554708, + "compression/movement_sparsity/importance_threshold": -0.0011175881897599565, + "compression/movement_sparsity/linear_layer_sparsity": 0.709663402688108, + "compression/movement_sparsity/model_sparsity": 0.6852842816257866, + "compression_loss": 122.94646453857422, + "distillation_loss": 4.748937606811523, + "epoch": 1.38, + "learning_rate": 4.3089602704987325e-05, + "loss": 127.7195, + "step": 1635, + "task_loss": 2.836721658706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1455382354882733, + "compression/movement_sparsity/importance_threshold": -0.0011130203962830937, + "compression/movement_sparsity/linear_layer_sparsity": 0.7108427028672949, + "compression/movement_sparsity/model_sparsity": 0.6864230692158667, + "compression_loss": 123.10173034667969, + "distillation_loss": 3.8511111736297607, + "epoch": 1.38, + "learning_rate": 4.308537616229924e-05, + "loss": 127.113, + "step": 1636, + "task_loss": 2.2419724464416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1469889639621966, + "compression/movement_sparsity/importance_threshold": -0.0011084650660821917, + "compression/movement_sparsity/linear_layer_sparsity": 0.712111326986242, + "compression/movement_sparsity/model_sparsity": 0.6876481121935837, + "compression_loss": 123.25662994384766, + "distillation_loss": 4.497777462005615, + "epoch": 1.38, + "learning_rate": 4.308114961961116e-05, + "loss": 127.6351, + "step": 1637, + "task_loss": 1.9297791719436646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1484357286995897, + "compression/movement_sparsity/importance_threshold": -0.0011039221821309148, + "compression/movement_sparsity/linear_layer_sparsity": 0.7133632572704989, + "compression/movement_sparsity/model_sparsity": 0.6888570348211883, + "compression_loss": 123.41108703613281, + "distillation_loss": 3.224041223526001, + "epoch": 1.38, + "learning_rate": 4.3076923076923084e-05, + "loss": 126.7166, + "step": 1638, + "task_loss": 2.079244613647461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1498785351228002, + "compression/movement_sparsity/importance_threshold": -0.0010993917274029312, + "compression/movement_sparsity/linear_layer_sparsity": 0.7146344450854877, + "compression/movement_sparsity/model_sparsity": 0.6900845534241011, + "compression_loss": 123.5650863647461, + "distillation_loss": 5.317746162414551, + "epoch": 1.39, + "learning_rate": 4.3072696534235e-05, + "loss": 127.8603, + "step": 1639, + "task_loss": 2.426868200302124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1513173886541763, + "compression/movement_sparsity/importance_threshold": -0.001094873684871908, + "compression/movement_sparsity/linear_layer_sparsity": 0.7158877228006875, + "compression/movement_sparsity/model_sparsity": 0.6912947771942505, + "compression_loss": 123.71871185302734, + "distillation_loss": 5.781313419342041, + "epoch": 1.39, + "learning_rate": 4.306846999154692e-05, + "loss": 128.8065, + "step": 1640, + "task_loss": 2.936715602874756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1527522947160667, + "compression/movement_sparsity/importance_threshold": -0.00109036803751151, + "compression/movement_sparsity/linear_layer_sparsity": 0.7172322442466368, + "compression/movement_sparsity/model_sparsity": 0.6925931101922997, + "compression_loss": 123.87187194824219, + "distillation_loss": 6.47765588760376, + "epoch": 1.39, + "learning_rate": 4.3064243448858837e-05, + "loss": 129.0338, + "step": 1641, + "task_loss": 3.2811150550842285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1541832587308194, + "compression/movement_sparsity/importance_threshold": -0.0010858747682954049, + "compression/movement_sparsity/linear_layer_sparsity": 0.7184827794032803, + "compression/movement_sparsity/model_sparsity": 0.6938006856192165, + "compression_loss": 124.0246810913086, + "distillation_loss": 4.720431804656982, + "epoch": 1.39, + "learning_rate": 4.306001690617075e-05, + "loss": 128.9026, + "step": 1642, + "task_loss": 2.5120980739593506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.155610286120782, + "compression/movement_sparsity/importance_threshold": -0.0010813938601972606, + "compression/movement_sparsity/linear_layer_sparsity": 0.7197101220538721, + "compression/movement_sparsity/model_sparsity": 0.6949858652740127, + "compression_loss": 124.17704010009766, + "distillation_loss": 5.344167709350586, + "epoch": 1.39, + "learning_rate": 4.3055790363482676e-05, + "loss": 129.3675, + "step": 1643, + "task_loss": 2.4594790935516357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1570333823083034, + "compression/movement_sparsity/importance_threshold": -0.0010769252961907423, + "compression/movement_sparsity/linear_layer_sparsity": 0.720708234505832, + "compression/movement_sparsity/model_sparsity": 0.6959496894926948, + "compression_loss": 124.32897186279297, + "distillation_loss": 3.9895737171173096, + "epoch": 1.39, + "learning_rate": 4.3051563820794596e-05, + "loss": 128.8685, + "step": 1644, + "task_loss": 1.6441888809204102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1584525527157319, + "compression/movement_sparsity/importance_threshold": -0.0010724690592495167, + "compression/movement_sparsity/linear_layer_sparsity": 0.7219841442912046, + "compression/movement_sparsity/model_sparsity": 0.6971817678517822, + "compression_loss": 124.48047637939453, + "distillation_loss": 5.422521591186523, + "epoch": 1.39, + "learning_rate": 4.304733727810651e-05, + "loss": 129.1549, + "step": 1645, + "task_loss": 2.982480049133301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1598678027654148, + "compression/movement_sparsity/importance_threshold": -0.0010680251323472519, + "compression/movement_sparsity/linear_layer_sparsity": 0.7231102269102325, + "compression/movement_sparsity/model_sparsity": 0.6982691660686112, + "compression_loss": 124.63160705566406, + "distillation_loss": 3.476102590560913, + "epoch": 1.39, + "learning_rate": 4.304311073541843e-05, + "loss": 129.4896, + "step": 1646, + "task_loss": 1.5611273050308228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1612791378797014, + "compression/movement_sparsity/importance_threshold": -0.0010635934984576131, + "compression/movement_sparsity/linear_layer_sparsity": 0.7242255539300531, + "compression/movement_sparsity/model_sparsity": 0.6993461781741537, + "compression_loss": 124.78230285644531, + "distillation_loss": 4.061097621917725, + "epoch": 1.39, + "learning_rate": 4.303888419273035e-05, + "loss": 129.3975, + "step": 1647, + "task_loss": 3.164276123046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.162686563480939, + "compression/movement_sparsity/importance_threshold": -0.0010591741405542683, + "compression/movement_sparsity/linear_layer_sparsity": 0.7253236266793044, + "compression/movement_sparsity/model_sparsity": 0.7004065287464013, + "compression_loss": 124.93258666992188, + "distillation_loss": 5.3908233642578125, + "epoch": 1.39, + "learning_rate": 4.303465765004227e-05, + "loss": 129.998, + "step": 1648, + "task_loss": 3.5379457473754883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1640900849914761, + "compression/movement_sparsity/importance_threshold": -0.0010547670416108834, + "compression/movement_sparsity/linear_layer_sparsity": 0.7264349710271344, + "compression/movement_sparsity/model_sparsity": 0.7014796949969884, + "compression_loss": 125.0823745727539, + "distillation_loss": 3.8984055519104004, + "epoch": 1.39, + "learning_rate": 4.303043110735419e-05, + "loss": 129.2775, + "step": 1649, + "task_loss": 2.8166308403015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1654897078336612, + "compression/movement_sparsity/importance_threshold": -0.0010503721846011251, + "compression/movement_sparsity/linear_layer_sparsity": 0.7276892980690861, + "compression/movement_sparsity/model_sparsity": 0.7026909320462876, + "compression_loss": 125.23187255859375, + "distillation_loss": 4.039017677307129, + "epoch": 1.39, + "learning_rate": 4.302620456466611e-05, + "loss": 129.7654, + "step": 1650, + "task_loss": 1.8861942291259766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1668854374298419, + "compression/movement_sparsity/importance_threshold": -0.0010459895524986612, + "compression/movement_sparsity/linear_layer_sparsity": 0.7286833563040499, + "compression/movement_sparsity/model_sparsity": 0.7036508413227995, + "compression_loss": 125.38087463378906, + "distillation_loss": 5.553791046142578, + "epoch": 1.4, + "learning_rate": 4.302197802197803e-05, + "loss": 130.7731, + "step": 1651, + "task_loss": 1.9815449714660645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.168277279202367, + "compression/movement_sparsity/importance_threshold": -0.0010416191282771564, + "compression/movement_sparsity/linear_layer_sparsity": 0.7298072806487358, + "compression/movement_sparsity/model_sparsity": 0.7047361554086498, + "compression_loss": 125.52945709228516, + "distillation_loss": 4.8394060134887695, + "epoch": 1.4, + "learning_rate": 4.301775147928994e-05, + "loss": 129.5692, + "step": 1652, + "task_loss": 1.6050645112991333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1696652385735846, + "compression/movement_sparsity/importance_threshold": -0.001037260894910279, + "compression/movement_sparsity/linear_layer_sparsity": 0.7307350166633089, + "compression/movement_sparsity/model_sparsity": 0.7056320208370724, + "compression_loss": 125.67767333984375, + "distillation_loss": 3.5678977966308594, + "epoch": 1.4, + "learning_rate": 4.301352493660186e-05, + "loss": 130.6722, + "step": 1653, + "task_loss": 1.6422040462493896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1710493209658424, + "compression/movement_sparsity/importance_threshold": -0.0010329148353716957, + "compression/movement_sparsity/linear_layer_sparsity": 0.7318193050747731, + "compression/movement_sparsity/model_sparsity": 0.7066790606059417, + "compression_loss": 125.82548522949219, + "distillation_loss": 5.687063217163086, + "epoch": 1.4, + "learning_rate": 4.300929839391378e-05, + "loss": 130.928, + "step": 1654, + "task_loss": 2.9897544384002686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.172429531801489, + "compression/movement_sparsity/importance_threshold": -0.0010285809326350723, + "compression/movement_sparsity/linear_layer_sparsity": 0.7328589017059381, + "compression/movement_sparsity/model_sparsity": 0.7076829438946529, + "compression_loss": 125.97283935546875, + "distillation_loss": 6.145558834075928, + "epoch": 1.4, + "learning_rate": 4.30050718512257e-05, + "loss": 130.9872, + "step": 1655, + "task_loss": 2.6128365993499756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1738058765028727, + "compression/movement_sparsity/importance_threshold": -0.0010242591696740765, + "compression/movement_sparsity/linear_layer_sparsity": 0.7340143058531827, + "compression/movement_sparsity/model_sparsity": 0.7087986563550007, + "compression_loss": 126.11978912353516, + "distillation_loss": 5.220210075378418, + "epoch": 1.4, + "learning_rate": 4.300084530853762e-05, + "loss": 131.198, + "step": 1656, + "task_loss": 2.4230151176452637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1751783604923411, + "compression/movement_sparsity/importance_threshold": -0.001019949529462375, + "compression/movement_sparsity/linear_layer_sparsity": 0.7351830531440119, + "compression/movement_sparsity/model_sparsity": 0.7099272535809026, + "compression_loss": 126.26640319824219, + "distillation_loss": 5.404793739318848, + "epoch": 1.4, + "learning_rate": 4.299661876584954e-05, + "loss": 130.8484, + "step": 1657, + "task_loss": 3.106280565261841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.176546989192243, + "compression/movement_sparsity/importance_threshold": -0.0010156519949736337, + "compression/movement_sparsity/linear_layer_sparsity": 0.7362322964267944, + "compression/movement_sparsity/model_sparsity": 0.7109404521290716, + "compression_loss": 126.41250610351562, + "distillation_loss": 4.530086994171143, + "epoch": 1.4, + "learning_rate": 4.299239222316145e-05, + "loss": 131.3589, + "step": 1658, + "task_loss": 1.633629560470581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1779117680249265, + "compression/movement_sparsity/importance_threshold": -0.0010113665491815194, + "compression/movement_sparsity/linear_layer_sparsity": 0.7373045652772817, + "compression/movement_sparsity/model_sparsity": 0.71197588524586, + "compression_loss": 126.55818939208984, + "distillation_loss": 5.269854545593262, + "epoch": 1.4, + "learning_rate": 4.298816568047337e-05, + "loss": 131.2301, + "step": 1659, + "task_loss": 1.6925129890441895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1792727024127396, + "compression/movement_sparsity/importance_threshold": -0.0010070931750596995, + "compression/movement_sparsity/linear_layer_sparsity": 0.7382474807572552, + "compression/movement_sparsity/model_sparsity": 0.7128864086783491, + "compression_loss": 126.70355224609375, + "distillation_loss": 4.663570404052734, + "epoch": 1.4, + "learning_rate": 4.29839391377853e-05, + "loss": 130.4322, + "step": 1660, + "task_loss": 1.6720130443572998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1806297977780305, + "compression/movement_sparsity/importance_threshold": -0.0010028318555818402, + "compression/movement_sparsity/linear_layer_sparsity": 0.7392262879818127, + "compression/movement_sparsity/model_sparsity": 0.7138315908635797, + "compression_loss": 126.84844207763672, + "distillation_loss": 4.284666061401367, + "epoch": 1.4, + "learning_rate": 4.297971259509721e-05, + "loss": 131.4044, + "step": 1661, + "task_loss": 2.589498281478882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1819830595431478, + "compression/movement_sparsity/importance_threshold": -0.0009985825737216084, + "compression/movement_sparsity/linear_layer_sparsity": 0.7402984495147913, + "compression/movement_sparsity/model_sparsity": 0.7148669203495458, + "compression_loss": 126.9928970336914, + "distillation_loss": 6.332771301269531, + "epoch": 1.4, + "learning_rate": 4.297548605240913e-05, + "loss": 132.1842, + "step": 1662, + "task_loss": 3.254516839981079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.183332493130439, + "compression/movement_sparsity/importance_threshold": -0.0009943453124526712, + "compression/movement_sparsity/linear_layer_sparsity": 0.7413892127492819, + "compression/movement_sparsity/model_sparsity": 0.7159202125113514, + "compression_loss": 127.13699340820312, + "distillation_loss": 4.129354000091553, + "epoch": 1.41, + "learning_rate": 4.297125950972105e-05, + "loss": 132.2666, + "step": 1663, + "task_loss": 1.7263877391815186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1846781039622531, + "compression/movement_sparsity/importance_threshold": -0.000990120054748694, + "compression/movement_sparsity/linear_layer_sparsity": 0.7423282528747738, + "compression/movement_sparsity/model_sparsity": 0.7168269937197074, + "compression_loss": 127.28064727783203, + "distillation_loss": 4.1973419189453125, + "epoch": 1.41, + "learning_rate": 4.2967032967032963e-05, + "loss": 131.7189, + "step": 1664, + "task_loss": 2.724881172180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1860198974609375, + "compression/movement_sparsity/importance_threshold": -0.0009859067835833457, + "compression/movement_sparsity/linear_layer_sparsity": 0.7434288416233963, + "compression/movement_sparsity/model_sparsity": 0.7178897738590077, + "compression_loss": 127.4238510131836, + "distillation_loss": 5.392586708068848, + "epoch": 1.41, + "learning_rate": 4.296280642434489e-05, + "loss": 131.892, + "step": 1665, + "task_loss": 3.000189781188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1873578790488408, + "compression/movement_sparsity/importance_threshold": -0.0009817054819302915, + "compression/movement_sparsity/linear_layer_sparsity": 0.7442039840647332, + "compression/movement_sparsity/model_sparsity": 0.7186382877728675, + "compression_loss": 127.56666564941406, + "distillation_loss": 4.866158485412598, + "epoch": 1.41, + "learning_rate": 4.295857988165681e-05, + "loss": 131.7309, + "step": 1666, + "task_loss": 2.953197479248047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1886920541483113, + "compression/movement_sparsity/importance_threshold": -0.000977516132763198, + "compression/movement_sparsity/linear_layer_sparsity": 0.7451646546303164, + "compression/movement_sparsity/model_sparsity": 0.7195659563491548, + "compression_loss": 127.70897674560547, + "distillation_loss": 3.580167770385742, + "epoch": 1.41, + "learning_rate": 4.295435333896873e-05, + "loss": 132.094, + "step": 1667, + "task_loss": 2.222722291946411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1900224281816971, + "compression/movement_sparsity/importance_threshold": -0.0009733387190557327, + "compression/movement_sparsity/linear_layer_sparsity": 0.7460860469877072, + "compression/movement_sparsity/model_sparsity": 0.7204556960445347, + "compression_loss": 127.8509292602539, + "distillation_loss": 5.3917012214660645, + "epoch": 1.41, + "learning_rate": 4.295012679628064e-05, + "loss": 132.6568, + "step": 1668, + "task_loss": 2.057342052459717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1913490065713461, + "compression/movement_sparsity/importance_threshold": -0.0009691732237815617, + "compression/movement_sparsity/linear_layer_sparsity": 0.7473162156660287, + "compression/movement_sparsity/model_sparsity": 0.7216436046443143, + "compression_loss": 127.99237060546875, + "distillation_loss": 4.974492073059082, + "epoch": 1.41, + "learning_rate": 4.294590025359256e-05, + "loss": 132.8543, + "step": 1669, + "task_loss": 2.2963058948516846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.192671794739607, + "compression/movement_sparsity/importance_threshold": -0.0009650196299143517, + "compression/movement_sparsity/linear_layer_sparsity": 0.7484738138601182, + "compression/movement_sparsity/model_sparsity": 0.7227614357792483, + "compression_loss": 128.1334228515625, + "distillation_loss": 6.833734512329102, + "epoch": 1.41, + "learning_rate": 4.294167371090448e-05, + "loss": 133.6974, + "step": 1670, + "task_loss": 3.656033515930176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1939907981088276, + "compression/movement_sparsity/importance_threshold": -0.0009608779204277706, + "compression/movement_sparsity/linear_layer_sparsity": 0.7496242217811234, + "compression/movement_sparsity/model_sparsity": 0.7238723236490983, + "compression_loss": 128.27403259277344, + "distillation_loss": 5.081347465515137, + "epoch": 1.41, + "learning_rate": 4.29374471682164e-05, + "loss": 132.7393, + "step": 1671, + "task_loss": 2.9365384578704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1953060221013563, + "compression/movement_sparsity/importance_threshold": -0.0009567480782954833, + "compression/movement_sparsity/linear_layer_sparsity": 0.7507504951868336, + "compression/movement_sparsity/model_sparsity": 0.7249599060985, + "compression_loss": 128.41429138183594, + "distillation_loss": 4.861852645874023, + "epoch": 1.41, + "learning_rate": 4.293322062552832e-05, + "loss": 133.4224, + "step": 1672, + "task_loss": 1.4553730487823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1966174721395413, + "compression/movement_sparsity/importance_threshold": -0.0009526300864911574, + "compression/movement_sparsity/linear_layer_sparsity": 0.7518278675810691, + "compression/movement_sparsity/model_sparsity": 0.7260002674366084, + "compression_loss": 128.55416870117188, + "distillation_loss": 5.963460922241211, + "epoch": 1.41, + "learning_rate": 4.292899408284024e-05, + "loss": 133.7726, + "step": 1673, + "task_loss": 3.1765036582946777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1979251536457305, + "compression/movement_sparsity/importance_threshold": -0.000948523927988461, + "compression/movement_sparsity/linear_layer_sparsity": 0.7528275540231569, + "compression/movement_sparsity/model_sparsity": 0.7269656115740153, + "compression_loss": 128.69354248046875, + "distillation_loss": 5.412083148956299, + "epoch": 1.41, + "learning_rate": 4.2924767540152154e-05, + "loss": 133.2293, + "step": 1674, + "task_loss": 2.20062518119812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1992290720422725, + "compression/movement_sparsity/importance_threshold": -0.0009444295857610582, + "compression/movement_sparsity/linear_layer_sparsity": 0.7538527939564884, + "compression/movement_sparsity/model_sparsity": 0.7279556313616299, + "compression_loss": 128.83245849609375, + "distillation_loss": 5.566836833953857, + "epoch": 1.42, + "learning_rate": 4.2920540997464074e-05, + "loss": 133.8009, + "step": 1675, + "task_loss": 3.1525473594665527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2005292327515154, + "compression/movement_sparsity/importance_threshold": -0.0009403470427826171, + "compression/movement_sparsity/linear_layer_sparsity": 0.7547149708973995, + "compression/movement_sparsity/model_sparsity": 0.7287881898722541, + "compression_loss": 128.9710235595703, + "distillation_loss": 4.037566184997559, + "epoch": 1.42, + "learning_rate": 4.2916314454775994e-05, + "loss": 134.0059, + "step": 1676, + "task_loss": 2.0260658264160156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2018256411958068, + "compression/movement_sparsity/importance_threshold": -0.0009362762820268053, + "compression/movement_sparsity/linear_layer_sparsity": 0.7557893264772231, + "compression/movement_sparsity/model_sparsity": 0.7298256380328065, + "compression_loss": 129.10919189453125, + "distillation_loss": 6.758441925048828, + "epoch": 1.42, + "learning_rate": 4.291208791208791e-05, + "loss": 134.5959, + "step": 1677, + "task_loss": 2.210911989212036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2031183027974959, + "compression/movement_sparsity/importance_threshold": -0.0009322172864672873, + "compression/movement_sparsity/linear_layer_sparsity": 0.7571184418985869, + "compression/movement_sparsity/model_sparsity": 0.7311090942506092, + "compression_loss": 129.2469024658203, + "distillation_loss": 7.2887115478515625, + "epoch": 1.42, + "learning_rate": 4.290786136939983e-05, + "loss": 134.6074, + "step": 1678, + "task_loss": 3.240959405899048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2044072229789304, + "compression/movement_sparsity/importance_threshold": -0.0009281700390777318, + "compression/movement_sparsity/linear_layer_sparsity": 0.758223072940038, + "compression/movement_sparsity/model_sparsity": 0.7321757778175438, + "compression_loss": 129.38433837890625, + "distillation_loss": 4.416586875915527, + "epoch": 1.42, + "learning_rate": 4.290363482671175e-05, + "loss": 134.093, + "step": 1679, + "task_loss": 2.084963798522949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2056924071624582, + "compression/movement_sparsity/importance_threshold": -0.0009241345228318048, + "compression/movement_sparsity/linear_layer_sparsity": 0.7592856235939022, + "compression/movement_sparsity/model_sparsity": 0.7332018265876596, + "compression_loss": 129.5212860107422, + "distillation_loss": 4.9988274574279785, + "epoch": 1.42, + "learning_rate": 4.289940828402367e-05, + "loss": 134.1176, + "step": 1680, + "task_loss": 2.9756672382354736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.206973860770428, + "compression/movement_sparsity/importance_threshold": -0.0009201107207031734, + "compression/movement_sparsity/linear_layer_sparsity": 0.7602929534274446, + "compression/movement_sparsity/model_sparsity": 0.7341745515425108, + "compression_loss": 129.65785217285156, + "distillation_loss": 5.318084716796875, + "epoch": 1.42, + "learning_rate": 4.2895181741335585e-05, + "loss": 134.7069, + "step": 1681, + "task_loss": 2.87835693359375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2082515892251877, + "compression/movement_sparsity/importance_threshold": -0.0009160986156655031, + "compression/movement_sparsity/linear_layer_sparsity": 0.761374880853717, + "compression/movement_sparsity/model_sparsity": 0.7352193114332927, + "compression_loss": 129.79391479492188, + "distillation_loss": 7.476860046386719, + "epoch": 1.42, + "learning_rate": 4.289095519864751e-05, + "loss": 136.285, + "step": 1682, + "task_loss": 3.2668232917785645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2095255979490855, + "compression/movement_sparsity/importance_threshold": -0.0009120981906924619, + "compression/movement_sparsity/linear_layer_sparsity": 0.7622706362506907, + "compression/movement_sparsity/model_sparsity": 0.7360842948767143, + "compression_loss": 129.92970275878906, + "distillation_loss": 5.667370796203613, + "epoch": 1.42, + "learning_rate": 4.288672865595943e-05, + "loss": 135.707, + "step": 1683, + "task_loss": 2.8117642402648926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2107958923644702, + "compression/movement_sparsity/importance_threshold": -0.0009081094287577154, + "compression/movement_sparsity/linear_layer_sparsity": 0.763171447494742, + "compression/movement_sparsity/model_sparsity": 0.736954160483313, + "compression_loss": 130.0651092529297, + "distillation_loss": 6.132251739501953, + "epoch": 1.42, + "learning_rate": 4.2882502113271345e-05, + "loss": 135.2298, + "step": 1684, + "task_loss": 3.029622793197632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2120624778936888, + "compression/movement_sparsity/importance_threshold": -0.0009041323128349322, + "compression/movement_sparsity/linear_layer_sparsity": 0.7641220659870082, + "compression/movement_sparsity/model_sparsity": 0.7378721223059254, + "compression_loss": 130.20005798339844, + "distillation_loss": 6.328795909881592, + "epoch": 1.42, + "learning_rate": 4.2878275570583264e-05, + "loss": 136.1438, + "step": 1685, + "task_loss": 3.421241521835327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2133253599590907, + "compression/movement_sparsity/importance_threshold": -0.0009001668258977768, + "compression/movement_sparsity/linear_layer_sparsity": 0.7651221697749633, + "compression/movement_sparsity/model_sparsity": 0.7388378694520851, + "compression_loss": 130.33468627929688, + "distillation_loss": 6.867501258850098, + "epoch": 1.42, + "learning_rate": 4.2874049027895184e-05, + "loss": 136.5501, + "step": 1686, + "task_loss": 5.162953853607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2145845439830232, + "compression/movement_sparsity/importance_threshold": -0.0008962129509199174, + "compression/movement_sparsity/linear_layer_sparsity": 0.7660799308436435, + "compression/movement_sparsity/model_sparsity": 0.7397627284816384, + "compression_loss": 130.46890258789062, + "distillation_loss": 4.743262767791748, + "epoch": 1.43, + "learning_rate": 4.2869822485207104e-05, + "loss": 135.8861, + "step": 1687, + "task_loss": 1.9818406105041504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.215840035387835, + "compression/movement_sparsity/importance_threshold": -0.0008922706708750206, + "compression/movement_sparsity/linear_layer_sparsity": 0.7670572117747435, + "compression/movement_sparsity/model_sparsity": 0.7407064368062873, + "compression_loss": 130.6027069091797, + "distillation_loss": 4.071943759918213, + "epoch": 1.43, + "learning_rate": 4.2865595942519024e-05, + "loss": 135.8047, + "step": 1688, + "task_loss": 1.5186123847961426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.217091839595874, + "compression/movement_sparsity/importance_threshold": -0.000888339968736752, + "compression/movement_sparsity/linear_layer_sparsity": 0.7681103065636724, + "compression/movement_sparsity/model_sparsity": 0.741723354549518, + "compression_loss": 130.73617553710938, + "distillation_loss": 4.814032554626465, + "epoch": 1.43, + "learning_rate": 4.286136939983094e-05, + "loss": 135.563, + "step": 1689, + "task_loss": 1.6335136890411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2183399620294888, + "compression/movement_sparsity/importance_threshold": -0.0008844208274787799, + "compression/movement_sparsity/linear_layer_sparsity": 0.769005704235617, + "compression/movement_sparsity/model_sparsity": 0.7425879925568659, + "compression_loss": 130.8692626953125, + "distillation_loss": 3.8926427364349365, + "epoch": 1.43, + "learning_rate": 4.2857142857142856e-05, + "loss": 136.1487, + "step": 1690, + "task_loss": 2.4474847316741943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.219584408111027, + "compression/movement_sparsity/importance_threshold": -0.0008805132300747703, + "compression/movement_sparsity/linear_layer_sparsity": 0.7698828221585757, + "compression/movement_sparsity/model_sparsity": 0.7434349787808407, + "compression_loss": 131.0019073486328, + "distillation_loss": 5.876618385314941, + "epoch": 1.43, + "learning_rate": 4.2852916314454776e-05, + "loss": 136.3073, + "step": 1691, + "task_loss": 3.0356574058532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2208251832628374, + "compression/movement_sparsity/importance_threshold": -0.0008766171594983894, + "compression/movement_sparsity/linear_layer_sparsity": 0.7708376141095146, + "compression/movement_sparsity/model_sparsity": 0.7443569706909812, + "compression_loss": 131.13417053222656, + "distillation_loss": 7.163200855255127, + "epoch": 1.43, + "learning_rate": 4.2848689771766696e-05, + "loss": 136.3789, + "step": 1692, + "task_loss": 4.30001163482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2220622929072678, + "compression/movement_sparsity/importance_threshold": -0.0008727325987233046, + "compression/movement_sparsity/linear_layer_sparsity": 0.7717705967578474, + "compression/movement_sparsity/model_sparsity": 0.7452579025151534, + "compression_loss": 131.26600646972656, + "distillation_loss": 8.87173843383789, + "epoch": 1.43, + "learning_rate": 4.2844463229078616e-05, + "loss": 136.8873, + "step": 1693, + "task_loss": 4.384936332702637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2232957424666666, + "compression/movement_sparsity/importance_threshold": -0.000868859530723182, + "compression/movement_sparsity/linear_layer_sparsity": 0.7724888370712261, + "compression/movement_sparsity/model_sparsity": 0.7459514690642018, + "compression_loss": 131.3975067138672, + "distillation_loss": 6.06578254699707, + "epoch": 1.43, + "learning_rate": 4.2840236686390535e-05, + "loss": 136.4462, + "step": 1694, + "task_loss": 1.5085026025772095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2245255373633819, + "compression/movement_sparsity/importance_threshold": -0.0008649979384716894, + "compression/movement_sparsity/linear_layer_sparsity": 0.7732114535541269, + "compression/movement_sparsity/model_sparsity": 0.7466492614478868, + "compression_loss": 131.52853393554688, + "distillation_loss": 4.902538299560547, + "epoch": 1.43, + "learning_rate": 4.2836010143702455e-05, + "loss": 136.909, + "step": 1695, + "task_loss": 2.141193389892578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.225751683019762, + "compression/movement_sparsity/importance_threshold": -0.0008611478049424925, + "compression/movement_sparsity/linear_layer_sparsity": 0.7741124317365251, + "compression/movement_sparsity/model_sparsity": 0.7475192882579865, + "compression_loss": 131.65914916992188, + "distillation_loss": 4.675021171569824, + "epoch": 1.43, + "learning_rate": 4.2831783601014375e-05, + "loss": 136.394, + "step": 1696, + "task_loss": 2.1710503101348877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2269741848581548, + "compression/movement_sparsity/importance_threshold": -0.0008573091131092586, + "compression/movement_sparsity/linear_layer_sparsity": 0.7751762463521586, + "compression/movement_sparsity/model_sparsity": 0.7485465575688964, + "compression_loss": 131.78932189941406, + "distillation_loss": 6.732913017272949, + "epoch": 1.43, + "learning_rate": 4.282755705832629e-05, + "loss": 137.5181, + "step": 1697, + "task_loss": 4.0819196701049805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.228193048300909, + "compression/movement_sparsity/importance_threshold": -0.0008534818459456535, + "compression/movement_sparsity/linear_layer_sparsity": 0.7759912751342375, + "compression/movement_sparsity/model_sparsity": 0.7493335876049891, + "compression_loss": 131.91915893554688, + "distillation_loss": 6.119390487670898, + "epoch": 1.44, + "learning_rate": 4.282333051563821e-05, + "loss": 137.3243, + "step": 1698, + "task_loss": 3.9338090419769287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2294082787703722, + "compression/movement_sparsity/importance_threshold": -0.0008496659864253454, + "compression/movement_sparsity/linear_layer_sparsity": 0.7769348345192634, + "compression/movement_sparsity/model_sparsity": 0.7502447328224111, + "compression_loss": 132.04861450195312, + "distillation_loss": 5.362759113311768, + "epoch": 1.44, + "learning_rate": 4.2819103972950134e-05, + "loss": 136.8099, + "step": 1699, + "task_loss": 3.3708407878875732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.230619881688893, + "compression/movement_sparsity/importance_threshold": -0.0008458615175220003, + "compression/movement_sparsity/linear_layer_sparsity": 0.7778929294646373, + "compression/movement_sparsity/model_sparsity": 0.7511699142589667, + "compression_loss": 132.1775665283203, + "distillation_loss": 6.90968132019043, + "epoch": 1.44, + "learning_rate": 4.281487743026205e-05, + "loss": 138.3985, + "step": 1700, + "task_loss": 2.7389421463012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2318278624788197, + "compression/movement_sparsity/importance_threshold": -0.0008420684222092841, + "compression/movement_sparsity/linear_layer_sparsity": 0.7787619031811008, + "compression/movement_sparsity/model_sparsity": 0.7520090360549938, + "compression_loss": 132.30618286132812, + "distillation_loss": 4.572964191436768, + "epoch": 1.44, + "learning_rate": 4.281065088757397e-05, + "loss": 137.3438, + "step": 1701, + "task_loss": 2.8707261085510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2330322265625, + "compression/movement_sparsity/importance_threshold": -0.0008382866834608649, + "compression/movement_sparsity/linear_layer_sparsity": 0.7796568000380047, + "compression/movement_sparsity/model_sparsity": 0.7528731904518383, + "compression_loss": 132.43434143066406, + "distillation_loss": 4.620830535888672, + "epoch": 1.44, + "learning_rate": 4.2806424344885886e-05, + "loss": 137.3701, + "step": 1702, + "task_loss": 1.9425766468048096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2342329793622824, + "compression/movement_sparsity/importance_threshold": -0.0008345162842504088, + "compression/movement_sparsity/linear_layer_sparsity": 0.7804211749559664, + "compression/movement_sparsity/model_sparsity": 0.7536113067398758, + "compression_loss": 132.56214904785156, + "distillation_loss": 5.746803283691406, + "epoch": 1.44, + "learning_rate": 4.28021978021978e-05, + "loss": 137.6664, + "step": 1703, + "task_loss": 2.6319899559020996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2354301263005154, + "compression/movement_sparsity/importance_threshold": -0.0008307572075515819, + "compression/movement_sparsity/linear_layer_sparsity": 0.7814443877807998, + "compression/movement_sparsity/model_sparsity": 0.7545993690564053, + "compression_loss": 132.68948364257812, + "distillation_loss": 4.8597211837768555, + "epoch": 1.44, + "learning_rate": 4.2797971259509726e-05, + "loss": 137.4753, + "step": 1704, + "task_loss": 2.681061267852783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2366236727995465, + "compression/movement_sparsity/importance_threshold": -0.0008270094363380526, + "compression/movement_sparsity/linear_layer_sparsity": 0.7823389865335129, + "compression/movement_sparsity/model_sparsity": 0.755463235589855, + "compression_loss": 132.8164520263672, + "distillation_loss": 5.233499050140381, + "epoch": 1.44, + "learning_rate": 4.2793744716821646e-05, + "loss": 137.7237, + "step": 1705, + "task_loss": 3.111027717590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2378136242817244, + "compression/movement_sparsity/importance_threshold": -0.0008232729535834859, + "compression/movement_sparsity/linear_layer_sparsity": 0.7831724500787567, + "compression/movement_sparsity/model_sparsity": 0.756268067098286, + "compression_loss": 132.94300842285156, + "distillation_loss": 5.164727210998535, + "epoch": 1.44, + "learning_rate": 4.278951817413356e-05, + "loss": 138.0709, + "step": 1706, + "task_loss": 2.71216082572937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2389999861693972, + "compression/movement_sparsity/importance_threshold": -0.000819547742261549, + "compression/movement_sparsity/linear_layer_sparsity": 0.7840868071772423, + "compression/movement_sparsity/model_sparsity": 0.7571510132175472, + "compression_loss": 133.06924438476562, + "distillation_loss": 6.585019111633301, + "epoch": 1.44, + "learning_rate": 4.278529163144548e-05, + "loss": 138.0081, + "step": 1707, + "task_loss": 3.1112542152404785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.240182763884913, + "compression/movement_sparsity/importance_threshold": -0.0008158337853459096, + "compression/movement_sparsity/linear_layer_sparsity": 0.7851427756905714, + "compression/movement_sparsity/model_sparsity": 0.7581707059639043, + "compression_loss": 133.19509887695312, + "distillation_loss": 6.9932451248168945, + "epoch": 1.44, + "learning_rate": 4.27810650887574e-05, + "loss": 138.4422, + "step": 1708, + "task_loss": 2.9983417987823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.24136196285062, + "compression/movement_sparsity/importance_threshold": -0.0008121310658102329, + "compression/movement_sparsity/linear_layer_sparsity": 0.7861185184040466, + "compression/movement_sparsity/model_sparsity": 0.7591129289134357, + "compression_loss": 133.32052612304688, + "distillation_loss": 5.459270477294922, + "epoch": 1.44, + "learning_rate": 4.277683854606932e-05, + "loss": 138.876, + "step": 1709, + "task_loss": 2.3245437145233154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2425375884888668, + "compression/movement_sparsity/importance_threshold": -0.0008084395666281864, + "compression/movement_sparsity/linear_layer_sparsity": 0.7869793836865177, + "compression/movement_sparsity/model_sparsity": 0.7599442208251227, + "compression_loss": 133.4455108642578, + "distillation_loss": 5.4145917892456055, + "epoch": 1.45, + "learning_rate": 4.277261200338124e-05, + "loss": 139.108, + "step": 1710, + "task_loss": 2.6897339820861816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2437096462220008, + "compression/movement_sparsity/importance_threshold": -0.0008047592707734375, + "compression/movement_sparsity/linear_layer_sparsity": 0.7879688153444386, + "compression/movement_sparsity/model_sparsity": 0.7608996624617462, + "compression_loss": 133.57017517089844, + "distillation_loss": 6.585570335388184, + "epoch": 1.45, + "learning_rate": 4.276838546069316e-05, + "loss": 138.994, + "step": 1711, + "task_loss": 3.6023776531219482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2448781414723709, + "compression/movement_sparsity/importance_threshold": -0.0008010901612196513, + "compression/movement_sparsity/linear_layer_sparsity": 0.788922999162828, + "compression/movement_sparsity/model_sparsity": 0.7618210671305612, + "compression_loss": 133.69448852539062, + "distillation_loss": 6.245987892150879, + "epoch": 1.45, + "learning_rate": 4.276415891800508e-05, + "loss": 138.9138, + "step": 1712, + "task_loss": 3.380957841873169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.246043079662325, + "compression/movement_sparsity/importance_threshold": -0.0007974322209404958, + "compression/movement_sparsity/linear_layer_sparsity": 0.7897434176686783, + "compression/movement_sparsity/model_sparsity": 0.762613301736833, + "compression_loss": 133.81842041015625, + "distillation_loss": 7.845611572265625, + "epoch": 1.45, + "learning_rate": 4.275993237531699e-05, + "loss": 139.9407, + "step": 1713, + "task_loss": 4.209351539611816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.247204466214211, + "compression/movement_sparsity/importance_threshold": -0.0007937854329096378, + "compression/movement_sparsity/linear_layer_sparsity": 0.7904920288370254, + "compression/movement_sparsity/model_sparsity": 0.76333619580855, + "compression_loss": 133.9419708251953, + "distillation_loss": 5.992743492126465, + "epoch": 1.45, + "learning_rate": 4.275570583262891e-05, + "loss": 139.5533, + "step": 1714, + "task_loss": 3.66042423248291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2483623065503777, + "compression/movement_sparsity/importance_threshold": -0.0007901497801007431, + "compression/movement_sparsity/linear_layer_sparsity": 0.7913430328328617, + "compression/movement_sparsity/model_sparsity": 0.7641579651991348, + "compression_loss": 134.06515502929688, + "distillation_loss": 5.880227088928223, + "epoch": 1.45, + "learning_rate": 4.275147928994083e-05, + "loss": 139.4521, + "step": 1715, + "task_loss": 2.656493902206421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2495166060931733, + "compression/movement_sparsity/importance_threshold": -0.000786525245487479, + "compression/movement_sparsity/linear_layer_sparsity": 0.7920941957730829, + "compression/movement_sparsity/model_sparsity": 0.764883323381512, + "compression_loss": 134.18798828125, + "distillation_loss": 6.351490020751953, + "epoch": 1.45, + "learning_rate": 4.274725274725275e-05, + "loss": 140.4867, + "step": 1716, + "task_loss": 2.8093924522399902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2506673702649453, + "compression/movement_sparsity/importance_threshold": -0.0007829118120435121, + "compression/movement_sparsity/linear_layer_sparsity": 0.7927116172491002, + "compression/movement_sparsity/model_sparsity": 0.7654795345304175, + "compression_loss": 134.3104705810547, + "distillation_loss": 6.49608039855957, + "epoch": 1.45, + "learning_rate": 4.274302620456467e-05, + "loss": 140.3823, + "step": 1717, + "task_loss": 3.8049371242523193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2518146044880425, + "compression/movement_sparsity/importance_threshold": -0.000779309462742509, + "compression/movement_sparsity/linear_layer_sparsity": 0.7936218605097515, + "compression/movement_sparsity/model_sparsity": 0.7663585081348295, + "compression_loss": 134.43263244628906, + "distillation_loss": 4.627270698547363, + "epoch": 1.45, + "learning_rate": 4.273879966187659e-05, + "loss": 139.556, + "step": 1718, + "task_loss": 2.935063600540161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2529583141848128, + "compression/movement_sparsity/importance_threshold": -0.000775718180558136, + "compression/movement_sparsity/linear_layer_sparsity": 0.794475869395832, + "compression/movement_sparsity/model_sparsity": 0.7671831791884346, + "compression_loss": 134.55433654785156, + "distillation_loss": 6.83725643157959, + "epoch": 1.45, + "learning_rate": 4.27345731191885e-05, + "loss": 141.2668, + "step": 1719, + "task_loss": 3.363121509552002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2540985047776043, + "compression/movement_sparsity/importance_threshold": -0.0007721379484640616, + "compression/movement_sparsity/linear_layer_sparsity": 0.7953367704508061, + "compression/movement_sparsity/model_sparsity": 0.7680145056437289, + "compression_loss": 134.67575073242188, + "distillation_loss": 6.054067611694336, + "epoch": 1.45, + "learning_rate": 4.273034657650042e-05, + "loss": 141.2501, + "step": 1720, + "task_loss": 3.6818389892578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.255235181688766, + "compression/movement_sparsity/importance_threshold": -0.0007685687494339505, + "compression/movement_sparsity/linear_layer_sparsity": 0.7962307372226343, + "compression/movement_sparsity/model_sparsity": 0.7688777619067814, + "compression_loss": 134.79669189453125, + "distillation_loss": 5.380577564239502, + "epoch": 1.45, + "learning_rate": 4.272612003381235e-05, + "loss": 140.2347, + "step": 1721, + "task_loss": 3.1315689086914062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2563683503406449, + "compression/movement_sparsity/importance_threshold": -0.000765010566441471, + "compression/movement_sparsity/linear_layer_sparsity": 0.7969657548398765, + "compression/movement_sparsity/model_sparsity": 0.7695875294076926, + "compression_loss": 134.91725158691406, + "distillation_loss": 4.683897018432617, + "epoch": 1.46, + "learning_rate": 4.272189349112426e-05, + "loss": 139.8576, + "step": 1722, + "task_loss": 2.7050366401672363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.25749801615559, + "compression/movement_sparsity/importance_threshold": -0.0007614633824602887, + "compression/movement_sparsity/linear_layer_sparsity": 0.7977235356931357, + "compression/movement_sparsity/model_sparsity": 0.7703192781574357, + "compression_loss": 135.03762817382812, + "distillation_loss": 6.378039360046387, + "epoch": 1.46, + "learning_rate": 4.271766694843618e-05, + "loss": 140.0228, + "step": 1723, + "task_loss": 2.7763330936431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2586241845559494, + "compression/movement_sparsity/importance_threshold": -0.0007579271804640702, + "compression/movement_sparsity/linear_layer_sparsity": 0.79850086025715, + "compression/movement_sparsity/model_sparsity": 0.7710698992313461, + "compression_loss": 135.15745544433594, + "distillation_loss": 7.575067520141602, + "epoch": 1.46, + "learning_rate": 4.27134404057481e-05, + "loss": 141.3022, + "step": 1724, + "task_loss": 3.397301197052002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.259746860964071, + "compression/movement_sparsity/importance_threshold": -0.0007544019434264828, + "compression/movement_sparsity/linear_layer_sparsity": 0.7992693609371136, + "compression/movement_sparsity/model_sparsity": 0.7718119995487683, + "compression_loss": 135.27696228027344, + "distillation_loss": 5.840609550476074, + "epoch": 1.46, + "learning_rate": 4.270921386306002e-05, + "loss": 140.381, + "step": 1725, + "task_loss": 2.734457015991211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.260866050802303, + "compression/movement_sparsity/importance_threshold": -0.000750887654321194, + "compression/movement_sparsity/linear_layer_sparsity": 0.8001048635150233, + "compression/movement_sparsity/model_sparsity": 0.7726188000428202, + "compression_loss": 135.3960418701172, + "distillation_loss": 4.727226257324219, + "epoch": 1.46, + "learning_rate": 4.270498732037194e-05, + "loss": 140.6312, + "step": 1726, + "task_loss": 2.4743335247039795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2619817594929943, + "compression/movement_sparsity/importance_threshold": -0.0007473842961218686, + "compression/movement_sparsity/linear_layer_sparsity": 0.8009066445468588, + "compression/movement_sparsity/model_sparsity": 0.7733930374296452, + "compression_loss": 135.51470947265625, + "distillation_loss": 7.124170303344727, + "epoch": 1.46, + "learning_rate": 4.270076077768386e-05, + "loss": 141.5347, + "step": 1727, + "task_loss": 3.0760762691497803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.263093992458492, + "compression/movement_sparsity/importance_threshold": -0.0007438918518021748, + "compression/movement_sparsity/linear_layer_sparsity": 0.8015949075028007, + "compression/movement_sparsity/model_sparsity": 0.7740576564357061, + "compression_loss": 135.63304138183594, + "distillation_loss": 7.86110782623291, + "epoch": 1.46, + "learning_rate": 4.269653423499578e-05, + "loss": 141.6356, + "step": 1728, + "task_loss": 3.749699592590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2642027551211452, + "compression/movement_sparsity/importance_threshold": -0.0007404103043357783, + "compression/movement_sparsity/linear_layer_sparsity": 0.8023719101142889, + "compression/movement_sparsity/model_sparsity": 0.7748079666171499, + "compression_loss": 135.7509765625, + "distillation_loss": 6.262055397033691, + "epoch": 1.46, + "learning_rate": 4.269230769230769e-05, + "loss": 141.7056, + "step": 1729, + "task_loss": 2.6465249061584473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2653080529033018, + "compression/movement_sparsity/importance_threshold": -0.0007369396366963474, + "compression/movement_sparsity/linear_layer_sparsity": 0.8030577882367037, + "compression/movement_sparsity/model_sparsity": 0.7754702827160519, + "compression_loss": 135.8685302734375, + "distillation_loss": 6.209960460662842, + "epoch": 1.46, + "learning_rate": 4.268808114961961e-05, + "loss": 140.8428, + "step": 1730, + "task_loss": 3.453437566757202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2664098912273096, + "compression/movement_sparsity/importance_threshold": -0.0007334798318575477, + "compression/movement_sparsity/linear_layer_sparsity": 0.8039512422693237, + "compression/movement_sparsity/model_sparsity": 0.7763330438540653, + "compression_loss": 135.98565673828125, + "distillation_loss": 7.214276313781738, + "epoch": 1.46, + "learning_rate": 4.268385460693153e-05, + "loss": 141.6959, + "step": 1731, + "task_loss": 3.2046098709106445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2675082755155174, + "compression/movement_sparsity/importance_threshold": -0.0007300308727930457, + "compression/movement_sparsity/linear_layer_sparsity": 0.8047782667640442, + "compression/movement_sparsity/model_sparsity": 0.7771316575131673, + "compression_loss": 136.10238647460938, + "distillation_loss": 5.1634297370910645, + "epoch": 1.46, + "learning_rate": 4.267962806424345e-05, + "loss": 141.5635, + "step": 1732, + "task_loss": 3.3998639583587646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.268603211190273, + "compression/movement_sparsity/importance_threshold": -0.0007265927424765081, + "compression/movement_sparsity/linear_layer_sparsity": 0.8054772137741879, + "compression/movement_sparsity/model_sparsity": 0.7778065935433001, + "compression_loss": 136.2188262939453, + "distillation_loss": 5.254622936248779, + "epoch": 1.46, + "learning_rate": 4.267540152155537e-05, + "loss": 141.6908, + "step": 1733, + "task_loss": 2.616302013397217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2696947036739248, + "compression/movement_sparsity/importance_threshold": -0.000723165423881603, + "compression/movement_sparsity/linear_layer_sparsity": 0.8064489141948344, + "compression/movement_sparsity/model_sparsity": 0.7787449130651972, + "compression_loss": 136.33480834960938, + "distillation_loss": 7.393130302429199, + "epoch": 1.47, + "learning_rate": 4.267117497886729e-05, + "loss": 143.2797, + "step": 1734, + "task_loss": 3.7298991680145264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2707827583888207, + "compression/movement_sparsity/importance_threshold": -0.0007197488999819953, + "compression/movement_sparsity/linear_layer_sparsity": 0.8070632234630988, + "compression/movement_sparsity/model_sparsity": 0.7793381189202603, + "compression_loss": 136.450439453125, + "distillation_loss": 5.786230087280273, + "epoch": 1.47, + "learning_rate": 4.2666948436179204e-05, + "loss": 141.9069, + "step": 1735, + "task_loss": 1.6120538711547852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2718673807573095, + "compression/movement_sparsity/importance_threshold": -0.0007163431537513523, + "compression/movement_sparsity/linear_layer_sparsity": 0.8076602903849618, + "compression/movement_sparsity/model_sparsity": 0.7799146747565646, + "compression_loss": 136.56582641601562, + "distillation_loss": 4.251001358032227, + "epoch": 1.47, + "learning_rate": 4.2662721893491124e-05, + "loss": 141.724, + "step": 1736, + "task_loss": 2.6781833171844482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2729485762017387, + "compression/movement_sparsity/importance_threshold": -0.0007129481681633423, + "compression/movement_sparsity/linear_layer_sparsity": 0.8082966712875201, + "compression/movement_sparsity/model_sparsity": 0.7805291940173834, + "compression_loss": 136.68081665039062, + "distillation_loss": 5.164517402648926, + "epoch": 1.47, + "learning_rate": 4.265849535080304e-05, + "loss": 141.5867, + "step": 1737, + "task_loss": 1.9678385257720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2740263501444573, + "compression/movement_sparsity/importance_threshold": -0.0007095639261916285, + "compression/movement_sparsity/linear_layer_sparsity": 0.8089805223014368, + "compression/movement_sparsity/model_sparsity": 0.7811895526452003, + "compression_loss": 136.7954559326172, + "distillation_loss": 4.900043487548828, + "epoch": 1.47, + "learning_rate": 4.265426880811497e-05, + "loss": 142.1821, + "step": 1738, + "task_loss": 3.5264742374420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2751007080078125, + "compression/movement_sparsity/importance_threshold": -0.0007061904108098815, + "compression/movement_sparsity/linear_layer_sparsity": 0.809836343660998, + "compression/movement_sparsity/model_sparsity": 0.7820159739082462, + "compression_loss": 136.90969848632812, + "distillation_loss": 5.815499305725098, + "epoch": 1.47, + "learning_rate": 4.265004226542688e-05, + "loss": 142.3562, + "step": 1739, + "task_loss": 2.7860569953918457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.276171655214153, + "compression/movement_sparsity/importance_threshold": -0.0007028276049917664, + "compression/movement_sparsity/linear_layer_sparsity": 0.8106320552915068, + "compression/movement_sparsity/model_sparsity": 0.7827843503963516, + "compression_loss": 137.02357482910156, + "distillation_loss": 5.78668212890625, + "epoch": 1.47, + "learning_rate": 4.26458157227388e-05, + "loss": 143.0579, + "step": 1740, + "task_loss": 3.000568151473999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2772391971858275, + "compression/movement_sparsity/importance_threshold": -0.0006994754917109486, + "compression/movement_sparsity/linear_layer_sparsity": 0.811196104193186, + "compression/movement_sparsity/model_sparsity": 0.7833290224830408, + "compression_loss": 137.13705444335938, + "distillation_loss": 5.275325775146484, + "epoch": 1.47, + "learning_rate": 4.264158918005072e-05, + "loss": 143.1385, + "step": 1741, + "task_loss": 3.2993671894073486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2783033393451833, + "compression/movement_sparsity/importance_threshold": -0.0006961340539410974, + "compression/movement_sparsity/linear_layer_sparsity": 0.8119959535098644, + "compression/movement_sparsity/model_sparsity": 0.784101394515067, + "compression_loss": 137.250244140625, + "distillation_loss": 5.988145351409912, + "epoch": 1.47, + "learning_rate": 4.2637362637362635e-05, + "loss": 142.5019, + "step": 1742, + "task_loss": 4.9025983810424805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2793640871145688, + "compression/movement_sparsity/importance_threshold": -0.0006928032746558776, + "compression/movement_sparsity/linear_layer_sparsity": 0.812696021391766, + "compression/movement_sparsity/model_sparsity": 0.7847774129115644, + "compression_loss": 137.36300659179688, + "distillation_loss": 7.043306350708008, + "epoch": 1.47, + "learning_rate": 4.2633136094674555e-05, + "loss": 143.3293, + "step": 1743, + "task_loss": 3.0225682258605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2804214459163328, + "compression/movement_sparsity/importance_threshold": -0.0006894831368289564, + "compression/movement_sparsity/linear_layer_sparsity": 0.8133931678525966, + "compression/movement_sparsity/model_sparsity": 0.7854506102467923, + "compression_loss": 137.4755401611328, + "distillation_loss": 7.989251136779785, + "epoch": 1.47, + "learning_rate": 4.262890955198648e-05, + "loss": 143.6456, + "step": 1744, + "task_loss": 4.132726669311523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2814754211728228, + "compression/movement_sparsity/importance_threshold": -0.0006861736234340006, + "compression/movement_sparsity/linear_layer_sparsity": 0.8140134392046791, + "compression/movement_sparsity/model_sparsity": 0.7860495733697528, + "compression_loss": 137.5876007080078, + "distillation_loss": 7.679581642150879, + "epoch": 1.47, + "learning_rate": 4.2624683009298394e-05, + "loss": 143.6763, + "step": 1745, + "task_loss": 3.0121266841888428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2825260183063873, + "compression/movement_sparsity/importance_threshold": -0.0006828747174446774, + "compression/movement_sparsity/linear_layer_sparsity": 0.8147610248946094, + "compression/movement_sparsity/model_sparsity": 0.7867714771913915, + "compression_loss": 137.69935607910156, + "distillation_loss": 6.29749059677124, + "epoch": 1.48, + "learning_rate": 4.2620456466610314e-05, + "loss": 143.7032, + "step": 1746, + "task_loss": 3.543337821960449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2835732427393747, + "compression/movement_sparsity/importance_threshold": -0.0006795864018346526, + "compression/movement_sparsity/linear_layer_sparsity": 0.8155494031620222, + "compression/movement_sparsity/model_sparsity": 0.7875327722399833, + "compression_loss": 137.81072998046875, + "distillation_loss": 6.114058494567871, + "epoch": 1.48, + "learning_rate": 4.2616229923922234e-05, + "loss": 143.8729, + "step": 1747, + "task_loss": 2.6716294288635254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2846170998941326, + "compression/movement_sparsity/importance_threshold": -0.0006763086595775935, + "compression/movement_sparsity/linear_layer_sparsity": 0.8163544871882927, + "compression/movement_sparsity/model_sparsity": 0.7883101991532233, + "compression_loss": 137.9217529296875, + "distillation_loss": 6.309618949890137, + "epoch": 1.48, + "learning_rate": 4.261200338123415e-05, + "loss": 143.4778, + "step": 1748, + "task_loss": 3.4192652702331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2856575951930098, + "compression/movement_sparsity/importance_threshold": -0.0006730414736471667, + "compression/movement_sparsity/linear_layer_sparsity": 0.8170422374050264, + "compression/movement_sparsity/model_sparsity": 0.7889743230342451, + "compression_loss": 138.03240966796875, + "distillation_loss": 4.761305332183838, + "epoch": 1.48, + "learning_rate": 4.2607776838546073e-05, + "loss": 143.5089, + "step": 1749, + "task_loss": 2.7217016220092773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.286694734058354, + "compression/movement_sparsity/importance_threshold": -0.0006697848270170396, + "compression/movement_sparsity/linear_layer_sparsity": 0.8177118390386183, + "compression/movement_sparsity/model_sparsity": 0.7896209217917876, + "compression_loss": 138.14260864257812, + "distillation_loss": 6.801706790924072, + "epoch": 1.48, + "learning_rate": 4.260355029585799e-05, + "loss": 144.1379, + "step": 1750, + "task_loss": 3.7426958084106445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2877285219125139, + "compression/movement_sparsity/importance_threshold": -0.0006665387026608778, + "compression/movement_sparsity/linear_layer_sparsity": 0.8184638724430768, + "compression/movement_sparsity/model_sparsity": 0.7903471205352776, + "compression_loss": 138.25244140625, + "distillation_loss": 7.121084213256836, + "epoch": 1.48, + "learning_rate": 4.2599323753169906e-05, + "loss": 143.5769, + "step": 1751, + "task_loss": 3.386566162109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2887589641778374, + "compression/movement_sparsity/importance_threshold": -0.0006633030835523478, + "compression/movement_sparsity/linear_layer_sparsity": 0.8191650135000657, + "compression/movement_sparsity/model_sparsity": 0.7910241752399966, + "compression_loss": 138.36195373535156, + "distillation_loss": 5.25375509262085, + "epoch": 1.48, + "learning_rate": 4.2595097210481826e-05, + "loss": 143.9507, + "step": 1752, + "task_loss": 2.1418049335479736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2897860662766727, + "compression/movement_sparsity/importance_threshold": -0.000660077952665118, + "compression/movement_sparsity/linear_layer_sparsity": 0.8199223412349546, + "compression/movement_sparsity/model_sparsity": 0.7917554864373795, + "compression_loss": 138.47105407714844, + "distillation_loss": 4.255082130432129, + "epoch": 1.48, + "learning_rate": 4.2590870667793746e-05, + "loss": 144.2206, + "step": 1753, + "task_loss": 2.147740602493286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2908098336313678, + "compression/movement_sparsity/importance_threshold": -0.000656863292972854, + "compression/movement_sparsity/linear_layer_sparsity": 0.8206230768702437, + "compression/movement_sparsity/model_sparsity": 0.7924321496478814, + "compression_loss": 138.57981872558594, + "distillation_loss": 5.833047389984131, + "epoch": 1.48, + "learning_rate": 4.2586644125105665e-05, + "loss": 143.9094, + "step": 1754, + "task_loss": 3.3217697143554688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2918302716642716, + "compression/movement_sparsity/importance_threshold": -0.0006536590874492224, + "compression/movement_sparsity/linear_layer_sparsity": 0.8212446956532689, + "compression/movement_sparsity/model_sparsity": 0.7930324139133866, + "compression_loss": 138.68817138671875, + "distillation_loss": 5.923211574554443, + "epoch": 1.48, + "learning_rate": 4.2582417582417585e-05, + "loss": 144.1391, + "step": 1755, + "task_loss": 3.254855155944824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2928473857977316, + "compression/movement_sparsity/importance_threshold": -0.0006504653190678895, + "compression/movement_sparsity/linear_layer_sparsity": 0.8219512622065319, + "compression/movement_sparsity/model_sparsity": 0.7937147077318922, + "compression_loss": 138.7961883544922, + "distillation_loss": 4.856620788574219, + "epoch": 1.48, + "learning_rate": 4.2578191039729505e-05, + "loss": 144.2538, + "step": 1756, + "task_loss": 3.082163095474243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2938611814540961, + "compression/movement_sparsity/importance_threshold": -0.0006472819708025239, + "compression/movement_sparsity/linear_layer_sparsity": 0.8227291352822574, + "compression/movement_sparsity/model_sparsity": 0.7944658584744491, + "compression_loss": 138.90382385253906, + "distillation_loss": 6.162832260131836, + "epoch": 1.48, + "learning_rate": 4.2573964497041425e-05, + "loss": 144.5456, + "step": 1757, + "task_loss": 3.0379414558410645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2948716640557136, + "compression/movement_sparsity/importance_threshold": -0.000644109025626791, + "compression/movement_sparsity/linear_layer_sparsity": 0.8233315442312212, + "compression/movement_sparsity/model_sparsity": 0.7950475728227893, + "compression_loss": 139.01109313964844, + "distillation_loss": 4.725363731384277, + "epoch": 1.49, + "learning_rate": 4.256973795435334e-05, + "loss": 144.0043, + "step": 1758, + "task_loss": 2.153745174407959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2958788390249318, + "compression/movement_sparsity/importance_threshold": -0.0006409464665143574, + "compression/movement_sparsity/linear_layer_sparsity": 0.8238902988024701, + "compression/movement_sparsity/model_sparsity": 0.7955871324555857, + "compression_loss": 139.1179656982422, + "distillation_loss": 5.50393533706665, + "epoch": 1.49, + "learning_rate": 4.256551141166526e-05, + "loss": 144.2977, + "step": 1759, + "task_loss": 2.245258331298828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2968827117840998, + "compression/movement_sparsity/importance_threshold": -0.0006377942764388897, + "compression/movement_sparsity/linear_layer_sparsity": 0.8245173073092668, + "compression/movement_sparsity/model_sparsity": 0.79619260129127, + "compression_loss": 139.22457885742188, + "distillation_loss": 5.646256446838379, + "epoch": 1.49, + "learning_rate": 4.256128486897718e-05, + "loss": 144.9921, + "step": 1760, + "task_loss": 2.5519707202911377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2978832877555648, + "compression/movement_sparsity/importance_threshold": -0.0006346524383740544, + "compression/movement_sparsity/linear_layer_sparsity": 0.825119275064028, + "compression/movement_sparsity/model_sparsity": 0.7967738896017857, + "compression_loss": 139.33065795898438, + "distillation_loss": 5.85346794128418, + "epoch": 1.49, + "learning_rate": 4.25570583262891e-05, + "loss": 145.2328, + "step": 1761, + "task_loss": 3.5156404972076416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2988805723616752, + "compression/movement_sparsity/importance_threshold": -0.0006315209352935206, + "compression/movement_sparsity/linear_layer_sparsity": 0.8257291604660995, + "compression/movement_sparsity/model_sparsity": 0.7973628235640692, + "compression_loss": 139.4365234375, + "distillation_loss": 5.043483734130859, + "epoch": 1.49, + "learning_rate": 4.2552831783601016e-05, + "loss": 145.1233, + "step": 1762, + "task_loss": 2.149935483932495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2998745710247797, + "compression/movement_sparsity/importance_threshold": -0.0006283997501709522, + "compression/movement_sparsity/linear_layer_sparsity": 0.8262538953870833, + "compression/movement_sparsity/model_sparsity": 0.7978695322262437, + "compression_loss": 139.5419158935547, + "distillation_loss": 6.715606212615967, + "epoch": 1.49, + "learning_rate": 4.2548605240912936e-05, + "loss": 144.7239, + "step": 1763, + "task_loss": 2.6563217639923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.300865289167226, + "compression/movement_sparsity/importance_threshold": -0.0006252888659800167, + "compression/movement_sparsity/linear_layer_sparsity": 0.8269647188681923, + "compression/movement_sparsity/model_sparsity": 0.7985559367340279, + "compression_loss": 139.64706420898438, + "distillation_loss": 7.84239387512207, + "epoch": 1.49, + "learning_rate": 4.254437869822485e-05, + "loss": 146.0125, + "step": 1764, + "task_loss": 2.8231847286224365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3018527322113624, + "compression/movement_sparsity/importance_threshold": -0.0006221882656943815, + "compression/movement_sparsity/linear_layer_sparsity": 0.8275511375083564, + "compression/movement_sparsity/model_sparsity": 0.7991222100898676, + "compression_loss": 139.75180053710938, + "distillation_loss": 3.9311342239379883, + "epoch": 1.49, + "learning_rate": 4.254015215553677e-05, + "loss": 144.7489, + "step": 1765, + "task_loss": 2.431652784347534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3028369055795375, + "compression/movement_sparsity/importance_threshold": -0.000619097932287713, + "compression/movement_sparsity/linear_layer_sparsity": 0.8281741633431626, + "compression/movement_sparsity/model_sparsity": 0.7997238330705966, + "compression_loss": 139.85621643066406, + "distillation_loss": 7.11520528793335, + "epoch": 1.49, + "learning_rate": 4.2535925612848695e-05, + "loss": 145.8988, + "step": 1766, + "task_loss": 3.0921268463134766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.303817814694099, + "compression/movement_sparsity/importance_threshold": -0.000616017848733677, + "compression/movement_sparsity/linear_layer_sparsity": 0.8288645726492792, + "compression/movement_sparsity/model_sparsity": 0.8003905246931005, + "compression_loss": 139.96034240722656, + "distillation_loss": 4.946022987365723, + "epoch": 1.49, + "learning_rate": 4.2531699070160615e-05, + "loss": 145.0384, + "step": 1767, + "task_loss": 2.514681339263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3047954649773952, + "compression/movement_sparsity/importance_threshold": -0.0006129479980059426, + "compression/movement_sparsity/linear_layer_sparsity": 0.8296048249761135, + "compression/movement_sparsity/model_sparsity": 0.8011053470752256, + "compression_loss": 140.06410217285156, + "distillation_loss": 7.6344475746154785, + "epoch": 1.49, + "learning_rate": 4.252747252747253e-05, + "loss": 145.7643, + "step": 1768, + "task_loss": 4.450456619262695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3057698618517743, + "compression/movement_sparsity/importance_threshold": -0.0006098883630781737, + "compression/movement_sparsity/linear_layer_sparsity": 0.8301827297605856, + "compression/movement_sparsity/model_sparsity": 0.801663399052508, + "compression_loss": 140.16763305664062, + "distillation_loss": 6.687248229980469, + "epoch": 1.5, + "learning_rate": 4.252324598478445e-05, + "loss": 145.8839, + "step": 1769, + "task_loss": 3.056523323059082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3067410107395847, + "compression/movement_sparsity/importance_threshold": -0.0006068389269240386, + "compression/movement_sparsity/linear_layer_sparsity": 0.8309462819109804, + "compression/movement_sparsity/model_sparsity": 0.8024007208375756, + "compression_loss": 140.27076721191406, + "distillation_loss": 5.196033477783203, + "epoch": 1.5, + "learning_rate": 4.251901944209637e-05, + "loss": 146.5131, + "step": 1770, + "task_loss": 3.0600903034210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3077089170631744, + "compression/movement_sparsity/importance_threshold": -0.0006037996725172037, + "compression/movement_sparsity/linear_layer_sparsity": 0.8313825633564415, + "compression/movement_sparsity/model_sparsity": 0.8028220146732263, + "compression_loss": 140.3734130859375, + "distillation_loss": 5.396224021911621, + "epoch": 1.5, + "learning_rate": 4.251479289940829e-05, + "loss": 145.3328, + "step": 1771, + "task_loss": 1.6985580921173096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3086735862448915, + "compression/movement_sparsity/importance_threshold": -0.0006007705828313366, + "compression/movement_sparsity/linear_layer_sparsity": 0.8321750673858561, + "compression/movement_sparsity/model_sparsity": 0.8035872937512031, + "compression_loss": 140.47586059570312, + "distillation_loss": 5.913259506225586, + "epoch": 1.5, + "learning_rate": 4.251056635672021e-05, + "loss": 146.2286, + "step": 1772, + "task_loss": 3.681518316268921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3096350237070846, + "compression/movement_sparsity/importance_threshold": -0.000597751640840102, + "compression/movement_sparsity/linear_layer_sparsity": 0.8328959429402822, + "compression/movement_sparsity/model_sparsity": 0.804283405012662, + "compression_loss": 140.57789611816406, + "distillation_loss": 5.118448734283447, + "epoch": 1.5, + "learning_rate": 4.250633981403213e-05, + "loss": 146.3774, + "step": 1773, + "task_loss": 2.3396239280700684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3105932348721014, + "compression/movement_sparsity/importance_threshold": -0.0005947428295171682, + "compression/movement_sparsity/linear_layer_sparsity": 0.8334564384400059, + "compression/movement_sparsity/model_sparsity": 0.8048246457676844, + "compression_loss": 140.67962646484375, + "distillation_loss": 5.340882778167725, + "epoch": 1.5, + "learning_rate": 4.250211327134404e-05, + "loss": 146.7904, + "step": 1774, + "task_loss": 4.274032115936279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3115482251622907, + "compression/movement_sparsity/importance_threshold": -0.0005917441318362008, + "compression/movement_sparsity/linear_layer_sparsity": 0.8340619476725551, + "compression/movement_sparsity/model_sparsity": 0.8054093538953312, + "compression_loss": 140.781005859375, + "distillation_loss": 6.783961772918701, + "epoch": 1.5, + "learning_rate": 4.249788672865596e-05, + "loss": 146.5327, + "step": 1775, + "task_loss": 3.5391533374786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3125, + "compression/movement_sparsity/importance_threshold": -0.0005887555307708681, + "compression/movement_sparsity/linear_layer_sparsity": 0.834637336457656, + "compression/movement_sparsity/model_sparsity": 0.805964976305561, + "compression_loss": 140.882080078125, + "distillation_loss": 6.131011486053467, + "epoch": 1.5, + "learning_rate": 4.249366018596788e-05, + "loss": 146.594, + "step": 1776, + "task_loss": 2.7621350288391113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.313448564807578, + "compression/movement_sparsity/importance_threshold": -0.0005857770092948349, + "compression/movement_sparsity/linear_layer_sparsity": 0.8352930106634492, + "compression/movement_sparsity/model_sparsity": 0.8065981260852954, + "compression_loss": 140.9827423095703, + "distillation_loss": 7.761905670166016, + "epoch": 1.5, + "learning_rate": 4.24894336432798e-05, + "loss": 147.1191, + "step": 1777, + "task_loss": 3.130160093307495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3143939250073724, + "compression/movement_sparsity/importance_threshold": -0.0005828085503817712, + "compression/movement_sparsity/linear_layer_sparsity": 0.8360540348903053, + "compression/movement_sparsity/model_sparsity": 0.8073330067887745, + "compression_loss": 141.08303833007812, + "distillation_loss": 6.2124176025390625, + "epoch": 1.5, + "learning_rate": 4.248520710059172e-05, + "loss": 146.2784, + "step": 1778, + "task_loss": 2.9587185382843018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.315336086021732, + "compression/movement_sparsity/importance_threshold": -0.0005798501370053392, + "compression/movement_sparsity/linear_layer_sparsity": 0.8366004002513805, + "compression/movement_sparsity/model_sparsity": 0.8078606028188804, + "compression_loss": 141.1830291748047, + "distillation_loss": 3.60258412361145, + "epoch": 1.5, + "learning_rate": 4.248098055790364e-05, + "loss": 146.1561, + "step": 1779, + "task_loss": 1.6228842735290527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3162750532730045, + "compression/movement_sparsity/importance_threshold": -0.0005769017521392089, + "compression/movement_sparsity/linear_layer_sparsity": 0.83709272528473, + "compression/movement_sparsity/model_sparsity": 0.8083360149727654, + "compression_loss": 141.28269958496094, + "distillation_loss": 5.322476387023926, + "epoch": 1.5, + "learning_rate": 4.247675401521555e-05, + "loss": 146.8022, + "step": 1780, + "task_loss": 3.44547176361084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3172108321835385, + "compression/movement_sparsity/importance_threshold": -0.0005739633787570459, + "compression/movement_sparsity/linear_layer_sparsity": 0.8376807894600279, + "compression/movement_sparsity/model_sparsity": 0.8089038773345448, + "compression_loss": 141.38189697265625, + "distillation_loss": 7.39438533782959, + "epoch": 1.51, + "learning_rate": 4.247252747252747e-05, + "loss": 147.0786, + "step": 1781, + "task_loss": 3.7741708755493164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3181434281756819, + "compression/movement_sparsity/importance_threshold": -0.0005710349998325169, + "compression/movement_sparsity/linear_layer_sparsity": 0.8382278106503231, + "compression/movement_sparsity/model_sparsity": 0.8094321066641194, + "compression_loss": 141.48077392578125, + "distillation_loss": 4.936664581298828, + "epoch": 1.51, + "learning_rate": 4.246830092983939e-05, + "loss": 147.7109, + "step": 1782, + "task_loss": 2.7555670738220215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.319072846671783, + "compression/movement_sparsity/importance_threshold": -0.0005681165983392892, + "compression/movement_sparsity/linear_layer_sparsity": 0.838723498298946, + "compression/movement_sparsity/model_sparsity": 0.8099107659170984, + "compression_loss": 141.5792999267578, + "distillation_loss": 6.903846263885498, + "epoch": 1.51, + "learning_rate": 4.246407438715132e-05, + "loss": 147.6862, + "step": 1783, + "task_loss": 3.432710886001587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3199990930941898, + "compression/movement_sparsity/importance_threshold": -0.0005652081572510284, + "compression/movement_sparsity/linear_layer_sparsity": 0.8394581343428238, + "compression/movement_sparsity/model_sparsity": 0.8106201649528643, + "compression_loss": 141.677490234375, + "distillation_loss": 7.227783203125, + "epoch": 1.51, + "learning_rate": 4.245984784446323e-05, + "loss": 147.7478, + "step": 1784, + "task_loss": 4.365713119506836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.320922172865251, + "compression/movement_sparsity/importance_threshold": -0.0005623096595414028, + "compression/movement_sparsity/linear_layer_sparsity": 0.8399994557809891, + "compression/movement_sparsity/model_sparsity": 0.8111428903343291, + "compression_loss": 141.775390625, + "distillation_loss": 5.524361610412598, + "epoch": 1.51, + "learning_rate": 4.245562130177515e-05, + "loss": 147.1765, + "step": 1785, + "task_loss": 1.9421035051345825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3218420914073143, + "compression/movement_sparsity/importance_threshold": -0.0005594210881840773, + "compression/movement_sparsity/linear_layer_sparsity": 0.8406164599111392, + "compression/movement_sparsity/model_sparsity": 0.8117386984744818, + "compression_loss": 141.87286376953125, + "distillation_loss": 5.094821929931641, + "epoch": 1.51, + "learning_rate": 4.245139475908707e-05, + "loss": 146.8701, + "step": 1786, + "task_loss": 3.685307741165161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.322758854142728, + "compression/movement_sparsity/importance_threshold": -0.00055654242615272, + "compression/movement_sparsity/linear_layer_sparsity": 0.8412501936484824, + "compression/movement_sparsity/model_sparsity": 0.8123506615083542, + "compression_loss": 141.97003173828125, + "distillation_loss": 4.70139217376709, + "epoch": 1.51, + "learning_rate": 4.244716821639898e-05, + "loss": 148.1169, + "step": 1787, + "task_loss": 2.0995371341705322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3236724664938404, + "compression/movement_sparsity/importance_threshold": -0.0005536736564209976, + "compression/movement_sparsity/linear_layer_sparsity": 0.8417558022045782, + "compression/movement_sparsity/model_sparsity": 0.8128389008551143, + "compression_loss": 142.0667266845703, + "distillation_loss": 5.7059197425842285, + "epoch": 1.51, + "learning_rate": 4.244294167371091e-05, + "loss": 148.1606, + "step": 1788, + "task_loss": 4.309201717376709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3245829338829997, + "compression/movement_sparsity/importance_threshold": -0.0005508147619625757, + "compression/movement_sparsity/linear_layer_sparsity": 0.8422418193532484, + "compression/movement_sparsity/model_sparsity": 0.813308221819564, + "compression_loss": 142.16311645507812, + "distillation_loss": 6.39350700378418, + "epoch": 1.51, + "learning_rate": 4.243871513102283e-05, + "loss": 148.4218, + "step": 1789, + "task_loss": 3.282238006591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3254902617325541, + "compression/movement_sparsity/importance_threshold": -0.0005479657257511216, + "compression/movement_sparsity/linear_layer_sparsity": 0.84278895978522, + "compression/movement_sparsity/model_sparsity": 0.8138365662944965, + "compression_loss": 142.25926208496094, + "distillation_loss": 10.267997741699219, + "epoch": 1.51, + "learning_rate": 4.243448858833474e-05, + "loss": 149.5215, + "step": 1790, + "task_loss": 4.538952350616455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3263944554648517, + "compression/movement_sparsity/importance_threshold": -0.0005451265307603028, + "compression/movement_sparsity/linear_layer_sparsity": 0.8432978236390805, + "compression/movement_sparsity/model_sparsity": 0.8143279491095285, + "compression_loss": 142.35498046875, + "distillation_loss": 6.726561546325684, + "epoch": 1.51, + "learning_rate": 4.243026204564666e-05, + "loss": 148.6764, + "step": 1791, + "task_loss": 3.5487115383148193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3272955205022408, + "compression/movement_sparsity/importance_threshold": -0.0005422971599637859, + "compression/movement_sparsity/linear_layer_sparsity": 0.8439652550741626, + "compression/movement_sparsity/model_sparsity": 0.8149724522215563, + "compression_loss": 142.4505157470703, + "distillation_loss": 6.282662868499756, + "epoch": 1.51, + "learning_rate": 4.242603550295858e-05, + "loss": 148.6727, + "step": 1792, + "task_loss": 3.161579132080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3281934622670695, + "compression/movement_sparsity/importance_threshold": -0.0005394775963352355, + "compression/movement_sparsity/linear_layer_sparsity": 0.8444908485352162, + "compression/movement_sparsity/model_sparsity": 0.8154799899303081, + "compression_loss": 142.545654296875, + "distillation_loss": 5.848675727844238, + "epoch": 1.52, + "learning_rate": 4.24218089602705e-05, + "loss": 149.2476, + "step": 1793, + "task_loss": 1.8755099773406982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.329088286181686, + "compression/movement_sparsity/importance_threshold": -0.000536667822848321, + "compression/movement_sparsity/linear_layer_sparsity": 0.8450572464979197, + "compression/movement_sparsity/model_sparsity": 0.8160269303805489, + "compression_loss": 142.64051818847656, + "distillation_loss": 6.196131706237793, + "epoch": 1.52, + "learning_rate": 4.241758241758242e-05, + "loss": 148.3279, + "step": 1794, + "task_loss": 2.8414669036865234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3299799976684386, + "compression/movement_sparsity/importance_threshold": -0.0005338678224767078, + "compression/movement_sparsity/linear_layer_sparsity": 0.8454806617665015, + "compression/movement_sparsity/model_sparsity": 0.8164358000320772, + "compression_loss": 142.7350311279297, + "distillation_loss": 6.473362445831299, + "epoch": 1.52, + "learning_rate": 4.241335587489434e-05, + "loss": 149.2629, + "step": 1795, + "task_loss": 3.872361183166504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3308686021496752, + "compression/movement_sparsity/importance_threshold": -0.0005310775781940625, + "compression/movement_sparsity/linear_layer_sparsity": 0.8459983852769154, + "compression/movement_sparsity/model_sparsity": 0.8169357381472045, + "compression_loss": 142.8291473388672, + "distillation_loss": 7.025053024291992, + "epoch": 1.52, + "learning_rate": 4.240912933220626e-05, + "loss": 149.7013, + "step": 1796, + "task_loss": 3.680112361907959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3317541050477444, + "compression/movement_sparsity/importance_threshold": -0.0005282970729740526, + "compression/movement_sparsity/linear_layer_sparsity": 0.8464915092294966, + "compression/movement_sparsity/model_sparsity": 0.8174119217749878, + "compression_loss": 142.92312622070312, + "distillation_loss": 6.706751346588135, + "epoch": 1.52, + "learning_rate": 4.2404902789518173e-05, + "loss": 148.0347, + "step": 1797, + "task_loss": 2.7960166931152344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3326365117849943, + "compression/movement_sparsity/importance_threshold": -0.0005255262897903446, + "compression/movement_sparsity/linear_layer_sparsity": 0.8469740922178876, + "compression/movement_sparsity/model_sparsity": 0.8178779265531286, + "compression_loss": 143.01654052734375, + "distillation_loss": 5.169501304626465, + "epoch": 1.52, + "learning_rate": 4.240067624683009e-05, + "loss": 148.3957, + "step": 1798, + "task_loss": 3.5008366107940674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3335158277837729, + "compression/movement_sparsity/importance_threshold": -0.000522765211616605, + "compression/movement_sparsity/linear_layer_sparsity": 0.8475131719125373, + "compression/movement_sparsity/model_sparsity": 0.818398487201864, + "compression_loss": 143.10958862304688, + "distillation_loss": 6.25333309173584, + "epoch": 1.52, + "learning_rate": 4.239644970414201e-05, + "loss": 148.8576, + "step": 1799, + "task_loss": 3.4381887912750244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3343920584664284, + "compression/movement_sparsity/importance_threshold": -0.0005200138214265003, + "compression/movement_sparsity/linear_layer_sparsity": 0.8480324336405762, + "compression/movement_sparsity/model_sparsity": 0.8188999106921089, + "compression_loss": 143.20242309570312, + "distillation_loss": 7.622268199920654, + "epoch": 1.52, + "learning_rate": 4.239222316145393e-05, + "loss": 149.2285, + "step": 1800, + "task_loss": 3.2173707485198975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3352652092553092, + "compression/movement_sparsity/importance_threshold": -0.0005172721021936971, + "compression/movement_sparsity/linear_layer_sparsity": 0.8484014031597327, + "compression/movement_sparsity/model_sparsity": 0.8192562049731994, + "compression_loss": 143.29476928710938, + "distillation_loss": 6.684247970581055, + "epoch": 1.52, + "learning_rate": 4.238799661876585e-05, + "loss": 149.5587, + "step": 1801, + "task_loss": 3.334637403488159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3361352855727633, + "compression/movement_sparsity/importance_threshold": -0.0005145400368918637, + "compression/movement_sparsity/linear_layer_sparsity": 0.8489656189997589, + "compression/movement_sparsity/model_sparsity": 0.8198010382633897, + "compression_loss": 143.3867950439453, + "distillation_loss": 6.105194091796875, + "epoch": 1.52, + "learning_rate": 4.238377007607777e-05, + "loss": 149.8265, + "step": 1802, + "task_loss": 3.4169492721557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3370022928411394, + "compression/movement_sparsity/importance_threshold": -0.0005118176084946647, + "compression/movement_sparsity/linear_layer_sparsity": 0.8496480510377269, + "compression/movement_sparsity/model_sparsity": 0.820460026661447, + "compression_loss": 143.478515625, + "distillation_loss": 5.205389022827148, + "epoch": 1.52, + "learning_rate": 4.2379543533389685e-05, + "loss": 150.1192, + "step": 1803, + "task_loss": 3.669997215270996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.337866236482785, + "compression/movement_sparsity/importance_threshold": -0.0005091047999757677, + "compression/movement_sparsity/linear_layer_sparsity": 0.8500777622668205, + "compression/movement_sparsity/model_sparsity": 0.820874975987875, + "compression_loss": 143.5699462890625, + "distillation_loss": 8.611461639404297, + "epoch": 1.52, + "learning_rate": 4.2375316990701605e-05, + "loss": 150.6223, + "step": 1804, + "task_loss": 3.927917718887329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3387271219200483, + "compression/movement_sparsity/importance_threshold": -0.0005064015943088401, + "compression/movement_sparsity/linear_layer_sparsity": 0.8506815663433978, + "compression/movement_sparsity/model_sparsity": 0.821458037536903, + "compression_loss": 143.66094970703125, + "distillation_loss": 7.84056282043457, + "epoch": 1.53, + "learning_rate": 4.237109044801353e-05, + "loss": 149.9805, + "step": 1805, + "task_loss": 4.580684185028076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.339584954575278, + "compression/movement_sparsity/importance_threshold": -0.0005037079744675465, + "compression/movement_sparsity/linear_layer_sparsity": 0.8511092146914903, + "compression/movement_sparsity/model_sparsity": 0.8218709948486385, + "compression_loss": 143.75173950195312, + "distillation_loss": 5.58791971206665, + "epoch": 1.53, + "learning_rate": 4.2366863905325444e-05, + "loss": 149.2539, + "step": 1806, + "task_loss": 3.2603187561035156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3404397398708223, + "compression/movement_sparsity/importance_threshold": -0.0005010239234255563, + "compression/movement_sparsity/linear_layer_sparsity": 0.8517961540648248, + "compression/movement_sparsity/model_sparsity": 0.8225343357412261, + "compression_loss": 143.84222412109375, + "distillation_loss": 6.129647254943848, + "epoch": 1.53, + "learning_rate": 4.2362637362637364e-05, + "loss": 149.8162, + "step": 1807, + "task_loss": 3.5768771171569824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3412914832290288, + "compression/movement_sparsity/importance_threshold": -0.0004983494241565349, + "compression/movement_sparsity/linear_layer_sparsity": 0.8523260759987301, + "compression/movement_sparsity/model_sparsity": 0.8230460532264714, + "compression_loss": 143.9322509765625, + "distillation_loss": 6.287062644958496, + "epoch": 1.53, + "learning_rate": 4.2358410819949284e-05, + "loss": 149.6768, + "step": 1808, + "task_loss": 3.7027273178100586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3421401900722463, + "compression/movement_sparsity/importance_threshold": -0.000495684459634149, + "compression/movement_sparsity/linear_layer_sparsity": 0.8529144144298836, + "compression/movement_sparsity/model_sparsity": 0.8236141804225741, + "compression_loss": 144.0220947265625, + "distillation_loss": 5.887596130371094, + "epoch": 1.53, + "learning_rate": 4.23541842772612e-05, + "loss": 149.5868, + "step": 1809, + "task_loss": 3.946030855178833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3429858658228226, + "compression/movement_sparsity/importance_threshold": -0.000493029012832066, + "compression/movement_sparsity/linear_layer_sparsity": 0.8534624253260926, + "compression/movement_sparsity/model_sparsity": 0.8241433654586195, + "compression_loss": 144.11154174804688, + "distillation_loss": 7.010804176330566, + "epoch": 1.53, + "learning_rate": 4.234995773457312e-05, + "loss": 150.4849, + "step": 1810, + "task_loss": 3.3192989826202393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3438285159031063, + "compression/movement_sparsity/importance_threshold": -0.0004903830667239515, + "compression/movement_sparsity/linear_layer_sparsity": 0.8539953879227451, + "compression/movement_sparsity/model_sparsity": 0.8246580191504923, + "compression_loss": 144.20071411132812, + "distillation_loss": 6.429386615753174, + "epoch": 1.53, + "learning_rate": 4.234573119188504e-05, + "loss": 150.6865, + "step": 1811, + "task_loss": 3.55954647064209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.344668145735445, + "compression/movement_sparsity/importance_threshold": -0.0004877466042834729, + "compression/movement_sparsity/linear_layer_sparsity": 0.854504239852438, + "compression/movement_sparsity/model_sparsity": 0.8251493904509886, + "compression_loss": 144.2896270751953, + "distillation_loss": 6.856687545776367, + "epoch": 1.53, + "learning_rate": 4.234150464919696e-05, + "loss": 150.9271, + "step": 1812, + "task_loss": 3.9256112575531006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3455047607421875, + "compression/movement_sparsity/importance_threshold": -0.0004851196084842968, + "compression/movement_sparsity/linear_layer_sparsity": 0.8549617463162907, + "compression/movement_sparsity/model_sparsity": 0.8255911801603535, + "compression_loss": 144.37806701660156, + "distillation_loss": 6.978229999542236, + "epoch": 1.53, + "learning_rate": 4.2337278106508876e-05, + "loss": 150.3833, + "step": 1813, + "task_loss": 2.405771255493164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3463383663456816, + "compression/movement_sparsity/importance_threshold": -0.00048250206230008967, + "compression/movement_sparsity/linear_layer_sparsity": 0.8555096260466557, + "compression/movement_sparsity/model_sparsity": 0.8261202385365053, + "compression_loss": 144.4663543701172, + "distillation_loss": 6.950952529907227, + "epoch": 1.53, + "learning_rate": 4.2333051563820795e-05, + "loss": 150.616, + "step": 1814, + "task_loss": 3.5087249279022217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3471689679682757, + "compression/movement_sparsity/importance_threshold": -0.0004798939487045189, + "compression/movement_sparsity/linear_layer_sparsity": 0.8560141018068262, + "compression/movement_sparsity/model_sparsity": 0.8266073840023649, + "compression_loss": 144.5541534423828, + "distillation_loss": 7.087459564208984, + "epoch": 1.53, + "learning_rate": 4.2328825021132715e-05, + "loss": 150.6183, + "step": 1815, + "task_loss": 4.267423152923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.347996571032318, + "compression/movement_sparsity/importance_threshold": -0.0004772952506712511, + "compression/movement_sparsity/linear_layer_sparsity": 0.8564571084828337, + "compression/movement_sparsity/model_sparsity": 0.8270351720362037, + "compression_loss": 144.64161682128906, + "distillation_loss": 6.650256633758545, + "epoch": 1.53, + "learning_rate": 4.2324598478444635e-05, + "loss": 150.562, + "step": 1816, + "task_loss": 3.4272704124450684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3488211809601565, + "compression/movement_sparsity/importance_threshold": -0.0004747059511739527, + "compression/movement_sparsity/linear_layer_sparsity": 0.8570149806656776, + "compression/movement_sparsity/model_sparsity": 0.8275738795933513, + "compression_loss": 144.72889709472656, + "distillation_loss": 5.877427577972412, + "epoch": 1.54, + "learning_rate": 4.2320371935756555e-05, + "loss": 150.7755, + "step": 1817, + "task_loss": 2.9719197750091553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3496428031741394, + "compression/movement_sparsity/importance_threshold": -0.0004721260331862903, + "compression/movement_sparsity/linear_layer_sparsity": 0.8574862953156528, + "compression/movement_sparsity/model_sparsity": 0.8280290031351663, + "compression_loss": 144.81565856933594, + "distillation_loss": 7.523372650146484, + "epoch": 1.54, + "learning_rate": 4.2316145393068474e-05, + "loss": 151.1628, + "step": 1818, + "task_loss": 3.132990598678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3504614430966153, + "compression/movement_sparsity/importance_threshold": -0.00046955547968193043, + "compression/movement_sparsity/linear_layer_sparsity": 0.85798281765601, + "compression/movement_sparsity/model_sparsity": 0.828508468405651, + "compression_loss": 144.90219116210938, + "distillation_loss": 5.5037713050842285, + "epoch": 1.54, + "learning_rate": 4.231191885038039e-05, + "loss": 151.1072, + "step": 1819, + "task_loss": 2.1953752040863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3512771061499318, + "compression/movement_sparsity/importance_threshold": -0.00046699427363454046, + "compression/movement_sparsity/linear_layer_sparsity": 0.8583481264557024, + "compression/movement_sparsity/model_sparsity": 0.8288612277242525, + "compression_loss": 144.9883575439453, + "distillation_loss": 6.959022521972656, + "epoch": 1.54, + "learning_rate": 4.230769230769231e-05, + "loss": 150.8487, + "step": 1820, + "task_loss": 2.932663917541504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3520897977564377, + "compression/movement_sparsity/importance_threshold": -0.00046444239801778694, + "compression/movement_sparsity/linear_layer_sparsity": 0.858797452940557, + "compression/movement_sparsity/model_sparsity": 0.8292951184620624, + "compression_loss": 145.07412719726562, + "distillation_loss": 7.074075222015381, + "epoch": 1.54, + "learning_rate": 4.230346576500423e-05, + "loss": 151.7975, + "step": 1821, + "task_loss": 2.8492836952209473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3528995233384806, + "compression/movement_sparsity/importance_threshold": -0.0004618998358053364, + "compression/movement_sparsity/linear_layer_sparsity": 0.8594212657704271, + "compression/movement_sparsity/model_sparsity": 0.8298975014021538, + "compression_loss": 145.1597137451172, + "distillation_loss": 5.904569625854492, + "epoch": 1.54, + "learning_rate": 4.2299239222316147e-05, + "loss": 150.8993, + "step": 1822, + "task_loss": 3.1637818813323975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3537062883184092, + "compression/movement_sparsity/importance_threshold": -0.0004593665699708545, + "compression/movement_sparsity/linear_layer_sparsity": 0.8598171600601054, + "compression/movement_sparsity/model_sparsity": 0.8302797955050684, + "compression_loss": 145.24488830566406, + "distillation_loss": 6.740392684936523, + "epoch": 1.54, + "learning_rate": 4.2295012679628066e-05, + "loss": 151.5313, + "step": 1823, + "task_loss": 2.514648675918579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3545100981185716, + "compression/movement_sparsity/importance_threshold": -0.00045684258348801035, + "compression/movement_sparsity/linear_layer_sparsity": 0.8601760775059449, + "compression/movement_sparsity/model_sparsity": 0.830626383032484, + "compression_loss": 145.329833984375, + "distillation_loss": 8.003349304199219, + "epoch": 1.54, + "learning_rate": 4.2290786136939986e-05, + "loss": 151.9314, + "step": 1824, + "task_loss": 3.5564355850219727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3553109581613154, + "compression/movement_sparsity/importance_threshold": -0.00045432785933046966, + "compression/movement_sparsity/linear_layer_sparsity": 0.8606724925287935, + "compression/movement_sparsity/model_sparsity": 0.8311057446721466, + "compression_loss": 145.41448974609375, + "distillation_loss": 6.623943328857422, + "epoch": 1.54, + "learning_rate": 4.2286559594251906e-05, + "loss": 151.7437, + "step": 1825, + "task_loss": 3.1305036544799805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.35610887386899, + "compression/movement_sparsity/importance_threshold": -0.00045182238047189807, + "compression/movement_sparsity/linear_layer_sparsity": 0.8610900053343956, + "compression/movement_sparsity/model_sparsity": 0.8315089146284567, + "compression_loss": 145.49874877929688, + "distillation_loss": 5.9034013748168945, + "epoch": 1.54, + "learning_rate": 4.228233305156382e-05, + "loss": 152.0808, + "step": 1826, + "task_loss": 2.798334836959839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3569038506639421, + "compression/movement_sparsity/importance_threshold": -0.000449326129885963, + "compression/movement_sparsity/linear_layer_sparsity": 0.8614956058965295, + "compression/movement_sparsity/model_sparsity": 0.8319005815635079, + "compression_loss": 145.58274841308594, + "distillation_loss": 6.358174800872803, + "epoch": 1.54, + "learning_rate": 4.2278106508875745e-05, + "loss": 151.3871, + "step": 1827, + "task_loss": 2.62842059135437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.357695893968521, + "compression/movement_sparsity/importance_threshold": -0.0004468390905463318, + "compression/movement_sparsity/linear_layer_sparsity": 0.8619283935608731, + "compression/movement_sparsity/model_sparsity": 0.8323185016401707, + "compression_loss": 145.66644287109375, + "distillation_loss": 6.892963409423828, + "epoch": 1.54, + "learning_rate": 4.2273879966187665e-05, + "loss": 152.408, + "step": 1828, + "task_loss": 3.1583328247070312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3584850092050746, + "compression/movement_sparsity/importance_threshold": -0.00044436124542667106, + "compression/movement_sparsity/linear_layer_sparsity": 0.8624150903870985, + "compression/movement_sparsity/model_sparsity": 0.8327884789331608, + "compression_loss": 145.749755859375, + "distillation_loss": 5.919205665588379, + "epoch": 1.55, + "learning_rate": 4.226965342349958e-05, + "loss": 151.6775, + "step": 1829, + "task_loss": 2.1881580352783203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3592712017959512, + "compression/movement_sparsity/importance_threshold": -0.0004418925775006464, + "compression/movement_sparsity/linear_layer_sparsity": 0.8628325435718625, + "compression/movement_sparsity/model_sparsity": 0.8331915913167919, + "compression_loss": 145.8328857421875, + "distillation_loss": 5.807865142822266, + "epoch": 1.55, + "learning_rate": 4.22654268808115e-05, + "loss": 151.2798, + "step": 1830, + "task_loss": 2.874177932739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3600544771634988, + "compression/movement_sparsity/importance_threshold": -0.00043943306974192527, + "compression/movement_sparsity/linear_layer_sparsity": 0.8632254806679671, + "compression/movement_sparsity/model_sparsity": 0.8335710298148293, + "compression_loss": 145.91561889648438, + "distillation_loss": 5.8735809326171875, + "epoch": 1.55, + "learning_rate": 4.226120033812342e-05, + "loss": 152.3776, + "step": 1831, + "task_loss": 3.535463809967041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3608348407300652, + "compression/movement_sparsity/importance_threshold": -0.000436982705124175, + "compression/movement_sparsity/linear_layer_sparsity": 0.8637796802071791, + "compression/movement_sparsity/model_sparsity": 0.8341061908949522, + "compression_loss": 145.99807739257812, + "distillation_loss": 7.100095272064209, + "epoch": 1.55, + "learning_rate": 4.225697379543534e-05, + "loss": 152.7719, + "step": 1832, + "task_loss": 3.687058687210083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3616122979179996, + "compression/movement_sparsity/importance_threshold": -0.00043454146662106133, + "compression/movement_sparsity/linear_layer_sparsity": 0.8642018553623269, + "compression/movement_sparsity/model_sparsity": 0.8345138630347579, + "compression_loss": 146.08018493652344, + "distillation_loss": 6.087064266204834, + "epoch": 1.55, + "learning_rate": 4.225274725274726e-05, + "loss": 152.1026, + "step": 1833, + "task_loss": 2.3889381885528564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3623868541496493, + "compression/movement_sparsity/importance_threshold": -0.00043210933720625073, + "compression/movement_sparsity/linear_layer_sparsity": 0.8645782774862557, + "compression/movement_sparsity/model_sparsity": 0.83487735390072, + "compression_loss": 146.1621551513672, + "distillation_loss": 7.033417224884033, + "epoch": 1.55, + "learning_rate": 4.224852071005918e-05, + "loss": 152.6333, + "step": 1834, + "task_loss": 3.7293763160705566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3631585148473628, + "compression/movement_sparsity/importance_threshold": -0.0004296862998534115, + "compression/movement_sparsity/linear_layer_sparsity": 0.8650280690136481, + "compression/movement_sparsity/model_sparsity": 0.8353116937054259, + "compression_loss": 146.24349975585938, + "distillation_loss": 6.198794364929199, + "epoch": 1.55, + "learning_rate": 4.224429416737109e-05, + "loss": 152.2939, + "step": 1835, + "task_loss": 3.011843204498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3639272854334887, + "compression/movement_sparsity/importance_threshold": -0.0004272723375362093, + "compression/movement_sparsity/linear_layer_sparsity": 0.8653568183153688, + "compression/movement_sparsity/model_sparsity": 0.8356291494572814, + "compression_loss": 146.32472229003906, + "distillation_loss": 7.574608325958252, + "epoch": 1.55, + "learning_rate": 4.224006762468301e-05, + "loss": 152.8366, + "step": 1836, + "task_loss": 4.515462398529053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3646931713303745, + "compression/movement_sparsity/importance_threshold": -0.00042486743322831064, + "compression/movement_sparsity/linear_layer_sparsity": 0.8657347428844199, + "compression/movement_sparsity/model_sparsity": 0.8359940911547535, + "compression_loss": 146.40557861328125, + "distillation_loss": 6.0002121925354, + "epoch": 1.55, + "learning_rate": 4.223584108199493e-05, + "loss": 152.0068, + "step": 1837, + "task_loss": 2.5071210861206055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3654561779603689, + "compression/movement_sparsity/importance_threshold": -0.0004224715699033821, + "compression/movement_sparsity/linear_layer_sparsity": 0.8661620931283216, + "compression/movement_sparsity/model_sparsity": 0.836406760603094, + "compression_loss": 146.4863739013672, + "distillation_loss": 7.250365257263184, + "epoch": 1.55, + "learning_rate": 4.223161453930685e-05, + "loss": 153.3079, + "step": 1838, + "task_loss": 4.441134929656982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3662163107458198, + "compression/movement_sparsity/importance_threshold": -0.0004200847305350919, + "compression/movement_sparsity/linear_layer_sparsity": 0.8666021306865879, + "compression/movement_sparsity/model_sparsity": 0.83683168151752, + "compression_loss": 146.5667724609375, + "distillation_loss": 6.808737277984619, + "epoch": 1.55, + "learning_rate": 4.222738799661877e-05, + "loss": 152.6478, + "step": 1839, + "task_loss": 3.1851916313171387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3669735751090755, + "compression/movement_sparsity/importance_threshold": -0.0004177068980971057, + "compression/movement_sparsity/linear_layer_sparsity": 0.8669859934911216, + "compression/movement_sparsity/model_sparsity": 0.8372023574538179, + "compression_loss": 146.6467742919922, + "distillation_loss": 4.923266887664795, + "epoch": 1.56, + "learning_rate": 4.222316145393069e-05, + "loss": 151.9591, + "step": 1840, + "task_loss": 2.7223432064056396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3677279764724841, + "compression/movement_sparsity/importance_threshold": -0.0004153380555630892, + "compression/movement_sparsity/linear_layer_sparsity": 0.867440268505545, + "compression/movement_sparsity/model_sparsity": 0.8376410267239824, + "compression_loss": 146.72659301757812, + "distillation_loss": 7.620291709899902, + "epoch": 1.56, + "learning_rate": 4.221893491124261e-05, + "loss": 153.3653, + "step": 1841, + "task_loss": 4.143599510192871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3684795202583941, + "compression/movement_sparsity/importance_threshold": -0.0004129781859067115, + "compression/movement_sparsity/linear_layer_sparsity": 0.8678263849777619, + "compression/movement_sparsity/model_sparsity": 0.8380138789075455, + "compression_loss": 146.80592346191406, + "distillation_loss": 6.786708831787109, + "epoch": 1.56, + "learning_rate": 4.221470836855452e-05, + "loss": 152.9798, + "step": 1842, + "task_loss": 3.249237060546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3692282118891534, + "compression/movement_sparsity/importance_threshold": -0.00041062727210163744, + "compression/movement_sparsity/linear_layer_sparsity": 0.8682289448771486, + "compression/movement_sparsity/model_sparsity": 0.8384026096359692, + "compression_loss": 146.8849639892578, + "distillation_loss": 9.060035705566406, + "epoch": 1.56, + "learning_rate": 4.221048182586644e-05, + "loss": 153.3529, + "step": 1843, + "task_loss": 3.531567096710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3699740567871104, + "compression/movement_sparsity/importance_threshold": -0.0004082852971215335, + "compression/movement_sparsity/linear_layer_sparsity": 0.8686887527053552, + "compression/movement_sparsity/model_sparsity": 0.8388466216507425, + "compression_loss": 146.9639129638672, + "distillation_loss": 5.924881935119629, + "epoch": 1.56, + "learning_rate": 4.220625528317837e-05, + "loss": 154.1292, + "step": 1844, + "task_loss": 2.439697742462158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3707170603746128, + "compression/movement_sparsity/importance_threshold": -0.0004059522439400689, + "compression/movement_sparsity/linear_layer_sparsity": 0.8691704652295087, + "compression/movement_sparsity/model_sparsity": 0.8393117858677703, + "compression_loss": 147.04232788085938, + "distillation_loss": 5.721198081970215, + "epoch": 1.56, + "learning_rate": 4.220202874049028e-05, + "loss": 153.2371, + "step": 1845, + "task_loss": 3.8636693954467773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3714572280740096, + "compression/movement_sparsity/importance_threshold": -0.00040362809553090746, + "compression/movement_sparsity/linear_layer_sparsity": 0.8696560411839763, + "compression/movement_sparsity/model_sparsity": 0.8397806807943956, + "compression_loss": 147.12051391601562, + "distillation_loss": 7.845920562744141, + "epoch": 1.56, + "learning_rate": 4.21978021978022e-05, + "loss": 154.077, + "step": 1846, + "task_loss": 3.2476134300231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3721945653076486, + "compression/movement_sparsity/importance_threshold": -0.00040131283486771665, + "compression/movement_sparsity/linear_layer_sparsity": 0.8700802315234546, + "compression/movement_sparsity/model_sparsity": 0.8401902988907507, + "compression_loss": 147.19830322265625, + "distillation_loss": 7.6389031410217285, + "epoch": 1.56, + "learning_rate": 4.219357565511412e-05, + "loss": 154.2532, + "step": 1847, + "task_loss": 3.4175102710723877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.372929077497878, + "compression/movement_sparsity/importance_threshold": -0.00039900644492416384, + "compression/movement_sparsity/linear_layer_sparsity": 0.870514366618741, + "compression/movement_sparsity/model_sparsity": 0.8406095201099583, + "compression_loss": 147.27589416503906, + "distillation_loss": 8.966110229492188, + "epoch": 1.56, + "learning_rate": 4.218934911242603e-05, + "loss": 154.0425, + "step": 1848, + "task_loss": 3.729973316192627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3736607700670453, + "compression/movement_sparsity/importance_threshold": -0.00039670890867391645, + "compression/movement_sparsity/linear_layer_sparsity": 0.8709058131938912, + "compression/movement_sparsity/model_sparsity": 0.8409875192910214, + "compression_loss": 147.35313415527344, + "distillation_loss": 5.036543846130371, + "epoch": 1.56, + "learning_rate": 4.218512256973796e-05, + "loss": 153.0647, + "step": 1849, + "task_loss": 3.1140565872192383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3743896484375, + "compression/movement_sparsity/importance_threshold": -0.00039442020909064013, + "compression/movement_sparsity/linear_layer_sparsity": 0.8712969616648504, + "compression/movement_sparsity/model_sparsity": 0.8413652306086898, + "compression_loss": 147.43003845214844, + "distillation_loss": 6.887899398803711, + "epoch": 1.56, + "learning_rate": 4.218089602704988e-05, + "loss": 153.4423, + "step": 1850, + "task_loss": 3.7076354026794434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3751157180315896, + "compression/movement_sparsity/importance_threshold": -0.0003921403291480014, + "compression/movement_sparsity/linear_layer_sparsity": 0.871736951526446, + "compression/movement_sparsity/model_sparsity": 0.8417901054649726, + "compression_loss": 147.50662231445312, + "distillation_loss": 6.127902984619141, + "epoch": 1.56, + "learning_rate": 4.217666948436179e-05, + "loss": 152.6885, + "step": 1851, + "task_loss": 3.397775173187256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.375838984271662, + "compression/movement_sparsity/importance_threshold": -0.0003898692518196677, + "compression/movement_sparsity/linear_layer_sparsity": 0.8721504458875549, + "compression/movement_sparsity/model_sparsity": 0.8421893950227198, + "compression_loss": 147.582763671875, + "distillation_loss": 5.996698379516602, + "epoch": 1.57, + "learning_rate": 4.217244294167371e-05, + "loss": 152.813, + "step": 1852, + "task_loss": 2.6034128665924072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.376559452580066, + "compression/movement_sparsity/importance_threshold": -0.00038760696007930553, + "compression/movement_sparsity/linear_layer_sparsity": 0.8724252621790587, + "compression/movement_sparsity/model_sparsity": 0.8424547705291766, + "compression_loss": 147.65869140625, + "distillation_loss": 5.135795593261719, + "epoch": 1.57, + "learning_rate": 4.216821639898563e-05, + "loss": 153.8605, + "step": 1853, + "task_loss": 3.0036847591400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3772771283791496, + "compression/movement_sparsity/importance_threshold": -0.00038535343690058056, + "compression/movement_sparsity/linear_layer_sparsity": 0.8728179011709724, + "compression/movement_sparsity/model_sparsity": 0.8428339211638193, + "compression_loss": 147.73423767089844, + "distillation_loss": 6.802160263061523, + "epoch": 1.57, + "learning_rate": 4.216398985629755e-05, + "loss": 154.3701, + "step": 1854, + "task_loss": 3.7523281574249268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.377992017091261, + "compression/movement_sparsity/importance_threshold": -0.00038310866525716105, + "compression/movement_sparsity/linear_layer_sparsity": 0.873237751113431, + "compression/movement_sparsity/model_sparsity": 0.843239347969145, + "compression_loss": 147.8097686767578, + "distillation_loss": 6.210305213928223, + "epoch": 1.57, + "learning_rate": 4.215976331360947e-05, + "loss": 154.1215, + "step": 1855, + "task_loss": 3.5660030841827393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3787041241387479, + "compression/movement_sparsity/importance_threshold": -0.00038087262812271355, + "compression/movement_sparsity/linear_layer_sparsity": 0.8734228261193073, + "compression/movement_sparsity/model_sparsity": 0.8434180650792121, + "compression_loss": 147.88473510742188, + "distillation_loss": 5.39631986618042, + "epoch": 1.57, + "learning_rate": 4.215553677092139e-05, + "loss": 153.3718, + "step": 1856, + "task_loss": 2.5087478160858154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3794134549439594, + "compression/movement_sparsity/importance_threshold": -0.0003786453084709037, + "compression/movement_sparsity/linear_layer_sparsity": 0.8739739134507664, + "compression/movement_sparsity/model_sparsity": 0.8439502208654927, + "compression_loss": 147.95953369140625, + "distillation_loss": 7.3603668212890625, + "epoch": 1.57, + "learning_rate": 4.215131022823331e-05, + "loss": 154.5319, + "step": 1857, + "task_loss": 3.0146098136901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.380120014929243, + "compression/movement_sparsity/importance_threshold": -0.00037642668927539893, + "compression/movement_sparsity/linear_layer_sparsity": 0.8743660516276394, + "compression/movement_sparsity/model_sparsity": 0.8443288878896319, + "compression_loss": 148.03399658203125, + "distillation_loss": 6.592740535736084, + "epoch": 1.57, + "learning_rate": 4.214708368554522e-05, + "loss": 154.4575, + "step": 1858, + "task_loss": 3.2644922733306885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3808238095169467, + "compression/movement_sparsity/importance_threshold": -0.0003742167535098666, + "compression/movement_sparsity/linear_layer_sparsity": 0.8747877617402493, + "compression/movement_sparsity/model_sparsity": 0.8447361109625416, + "compression_loss": 148.10821533203125, + "distillation_loss": 5.234306335449219, + "epoch": 1.57, + "learning_rate": 4.214285714285714e-05, + "loss": 153.6809, + "step": 1859, + "task_loss": 3.560842752456665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.38152484412942, + "compression/movement_sparsity/importance_threshold": -0.0003720154841479724, + "compression/movement_sparsity/linear_layer_sparsity": 0.8751528201324212, + "compression/movement_sparsity/model_sparsity": 0.8450886284758915, + "compression_loss": 148.18199157714844, + "distillation_loss": 6.694014549255371, + "epoch": 1.57, + "learning_rate": 4.213863060016906e-05, + "loss": 155.3337, + "step": 1860, + "task_loss": 5.197516918182373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3822231241890095, + "compression/movement_sparsity/importance_threshold": -0.0003698228641633829, + "compression/movement_sparsity/linear_layer_sparsity": 0.8755148617101813, + "compression/movement_sparsity/model_sparsity": 0.8454382328116853, + "compression_loss": 148.2556915283203, + "distillation_loss": 6.522852420806885, + "epoch": 1.57, + "learning_rate": 4.213440405748098e-05, + "loss": 154.911, + "step": 1861, + "task_loss": 2.605754852294922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3829186551180643, + "compression/movement_sparsity/importance_threshold": -0.0003676388765297663, + "compression/movement_sparsity/linear_layer_sparsity": 0.8759651063559438, + "compression/movement_sparsity/model_sparsity": 0.8458730101687514, + "compression_loss": 148.3290252685547, + "distillation_loss": 7.264030456542969, + "epoch": 1.57, + "learning_rate": 4.21301775147929e-05, + "loss": 154.6723, + "step": 1862, + "task_loss": 3.095665693283081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3836114423389325, + "compression/movement_sparsity/importance_threshold": -0.0003654635042207874, + "compression/movement_sparsity/linear_layer_sparsity": 0.87637639474604, + "compression/movement_sparsity/model_sparsity": 0.8462701695373767, + "compression_loss": 148.40194702148438, + "distillation_loss": 6.592911243438721, + "epoch": 1.57, + "learning_rate": 4.212595097210482e-05, + "loss": 155.1815, + "step": 1863, + "task_loss": 3.470334768295288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3843014912739622, + "compression/movement_sparsity/importance_threshold": -0.00036329673021011366, + "compression/movement_sparsity/linear_layer_sparsity": 0.8767222910008211, + "compression/movement_sparsity/model_sparsity": 0.8466041831917047, + "compression_loss": 148.4746551513672, + "distillation_loss": 7.3904948234558105, + "epoch": 1.58, + "learning_rate": 4.2121724429416735e-05, + "loss": 155.475, + "step": 1864, + "task_loss": 4.322600364685059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3849888073455014, + "compression/movement_sparsity/importance_threshold": -0.0003611385374714124, + "compression/movement_sparsity/linear_layer_sparsity": 0.8772139959774536, + "compression/movement_sparsity/model_sparsity": 0.8470789965897284, + "compression_loss": 148.54693603515625, + "distillation_loss": 6.1083526611328125, + "epoch": 1.58, + "learning_rate": 4.2117497886728655e-05, + "loss": 155.6581, + "step": 1865, + "task_loss": 3.5356814861297607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3856733959758984, + "compression/movement_sparsity/importance_threshold": -0.00035898890897835024, + "compression/movement_sparsity/linear_layer_sparsity": 0.8775861969460395, + "compression/movement_sparsity/model_sparsity": 0.8474384113100192, + "compression_loss": 148.61912536621094, + "distillation_loss": 7.036923885345459, + "epoch": 1.58, + "learning_rate": 4.211327134404058e-05, + "loss": 154.8982, + "step": 1866, + "task_loss": 2.5222835540771484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3863552625875017, + "compression/movement_sparsity/importance_threshold": -0.00035684782770459365, + "compression/movement_sparsity/linear_layer_sparsity": 0.8780465055892867, + "compression/movement_sparsity/model_sparsity": 0.8478829069352959, + "compression_loss": 148.69093322753906, + "distillation_loss": 7.396998405456543, + "epoch": 1.58, + "learning_rate": 4.2109044801352494e-05, + "loss": 155.8431, + "step": 1867, + "task_loss": 3.3267486095428467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3870344126026593, + "compression/movement_sparsity/importance_threshold": -0.00035471527662380917, + "compression/movement_sparsity/linear_layer_sparsity": 0.8783268427704057, + "compression/movement_sparsity/model_sparsity": 0.8481536136718255, + "compression_loss": 148.76242065429688, + "distillation_loss": 8.70544147491455, + "epoch": 1.58, + "learning_rate": 4.2104818258664414e-05, + "loss": 155.9333, + "step": 1868, + "task_loss": 3.489133596420288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3877108514437193, + "compression/movement_sparsity/importance_threshold": -0.00035259123870966334, + "compression/movement_sparsity/linear_layer_sparsity": 0.8786594197299377, + "compression/movement_sparsity/model_sparsity": 0.848474765589671, + "compression_loss": 148.83370971679688, + "distillation_loss": 6.990805625915527, + "epoch": 1.58, + "learning_rate": 4.2100591715976334e-05, + "loss": 155.0529, + "step": 1869, + "task_loss": 3.1875967979431152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3883845845330298, + "compression/movement_sparsity/importance_threshold": -0.00035047569693582355, + "compression/movement_sparsity/linear_layer_sparsity": 0.8789720952536852, + "compression/movement_sparsity/model_sparsity": 0.8487766997472755, + "compression_loss": 148.90463256835938, + "distillation_loss": 5.877851486206055, + "epoch": 1.58, + "learning_rate": 4.209636517328825e-05, + "loss": 155.1167, + "step": 1870, + "task_loss": 2.9592432975769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3890556172929394, + "compression/movement_sparsity/importance_threshold": -0.00034836863427595635, + "compression/movement_sparsity/linear_layer_sparsity": 0.8793267199991758, + "compression/movement_sparsity/model_sparsity": 0.8491191420418052, + "compression_loss": 148.975341796875, + "distillation_loss": 6.457624912261963, + "epoch": 1.58, + "learning_rate": 4.2092138630600166e-05, + "loss": 155.3938, + "step": 1871, + "task_loss": 2.821925640106201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3897239551457958, + "compression/movement_sparsity/importance_threshold": -0.00034627003370372826, + "compression/movement_sparsity/linear_layer_sparsity": 0.8797472019225192, + "compression/movement_sparsity/model_sparsity": 0.849525179117528, + "compression_loss": 149.04559326171875, + "distillation_loss": 4.93093729019165, + "epoch": 1.58, + "learning_rate": 4.208791208791209e-05, + "loss": 154.349, + "step": 1872, + "task_loss": 2.7775089740753174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3903896035139476, + "compression/movement_sparsity/importance_threshold": -0.0003441798781928058, + "compression/movement_sparsity/linear_layer_sparsity": 0.8800460811843122, + "compression/movement_sparsity/model_sparsity": 0.8498137909572182, + "compression_loss": 149.11572265625, + "distillation_loss": 7.825369834899902, + "epoch": 1.58, + "learning_rate": 4.208368554522401e-05, + "loss": 155.5625, + "step": 1873, + "task_loss": 3.8700406551361084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.391052567819743, + "compression/movement_sparsity/importance_threshold": -0.00034209815071685554, + "compression/movement_sparsity/linear_layer_sparsity": 0.8803635144509464, + "compression/movement_sparsity/model_sparsity": 0.8501203194146046, + "compression_loss": 149.18551635742188, + "distillation_loss": 6.081006050109863, + "epoch": 1.58, + "learning_rate": 4.2079459002535926e-05, + "loss": 155.2744, + "step": 1874, + "task_loss": 2.857639789581299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3917128534855299, + "compression/movement_sparsity/importance_threshold": -0.00034002483424954485, + "compression/movement_sparsity/linear_layer_sparsity": 0.8807055472754135, + "compression/movement_sparsity/model_sparsity": 0.8504506023593353, + "compression_loss": 149.25497436523438, + "distillation_loss": 7.600732326507568, + "epoch": 1.58, + "learning_rate": 4.2075232459847845e-05, + "loss": 155.9755, + "step": 1875, + "task_loss": 4.325234889984131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3923704659336564, + "compression/movement_sparsity/importance_threshold": -0.00033795991176454026, + "compression/movement_sparsity/linear_layer_sparsity": 0.8810463161381112, + "compression/movement_sparsity/model_sparsity": 0.8507796647632717, + "compression_loss": 149.323974609375, + "distillation_loss": 4.604580402374268, + "epoch": 1.59, + "learning_rate": 4.2071005917159765e-05, + "loss": 154.7705, + "step": 1876, + "task_loss": 1.841259241104126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3930254105864712, + "compression/movement_sparsity/importance_threshold": -0.0003359033662355083, + "compression/movement_sparsity/linear_layer_sparsity": 0.8814063306073732, + "compression/movement_sparsity/model_sparsity": 0.8511273116279805, + "compression_loss": 149.39285278320312, + "distillation_loss": 6.943325042724609, + "epoch": 1.59, + "learning_rate": 4.2066779374471685e-05, + "loss": 155.9174, + "step": 1877, + "task_loss": 2.5194525718688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3936776928663221, + "compression/movement_sparsity/importance_threshold": -0.0003338551806361164, + "compression/movement_sparsity/linear_layer_sparsity": 0.8817211524812952, + "compression/movement_sparsity/model_sparsity": 0.8514313184020279, + "compression_loss": 149.46141052246094, + "distillation_loss": 6.574117660522461, + "epoch": 1.59, + "learning_rate": 4.2062552831783605e-05, + "loss": 155.6478, + "step": 1878, + "task_loss": 3.3019862174987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3943273181955573, + "compression/movement_sparsity/importance_threshold": -0.0003318153379400302, + "compression/movement_sparsity/linear_layer_sparsity": 0.8820229770124942, + "compression/movement_sparsity/model_sparsity": 0.8517227743320593, + "compression_loss": 149.5296173095703, + "distillation_loss": 7.748133659362793, + "epoch": 1.59, + "learning_rate": 4.2058326289095524e-05, + "loss": 156.5978, + "step": 1879, + "task_loss": 3.981105089187622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3949742919965253, + "compression/movement_sparsity/importance_threshold": -0.0003297838211209171, + "compression/movement_sparsity/linear_layer_sparsity": 0.882394820256051, + "compression/movement_sparsity/model_sparsity": 0.8520818436162763, + "compression_loss": 149.5975799560547, + "distillation_loss": 6.929703235626221, + "epoch": 1.59, + "learning_rate": 4.205409974640744e-05, + "loss": 156.2494, + "step": 1880, + "task_loss": 2.9411325454711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.395618619691574, + "compression/movement_sparsity/importance_threshold": -0.0003277606131524436, + "compression/movement_sparsity/linear_layer_sparsity": 0.8827588650939739, + "compression/movement_sparsity/model_sparsity": 0.8524333823940836, + "compression_loss": 149.66529846191406, + "distillation_loss": 5.461925506591797, + "epoch": 1.59, + "learning_rate": 4.204987320371936e-05, + "loss": 155.5635, + "step": 1881, + "task_loss": 3.765636682510376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3962603067030517, + "compression/movement_sparsity/importance_threshold": -0.0003257456970082763, + "compression/movement_sparsity/linear_layer_sparsity": 0.883219245282227, + "compression/movement_sparsity/model_sparsity": 0.852877947106575, + "compression_loss": 149.73281860351562, + "distillation_loss": 4.841838359832764, + "epoch": 1.59, + "learning_rate": 4.204564666103128e-05, + "loss": 155.8555, + "step": 1882, + "task_loss": 1.7235074043273926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3968993584533067, + "compression/movement_sparsity/importance_threshold": -0.0003237390556620817, + "compression/movement_sparsity/linear_layer_sparsity": 0.8835713421041786, + "compression/movement_sparsity/model_sparsity": 0.8532179483195163, + "compression_loss": 149.7999725341797, + "distillation_loss": 6.933751583099365, + "epoch": 1.59, + "learning_rate": 4.20414201183432e-05, + "loss": 155.8288, + "step": 1883, + "task_loss": 3.108952045440674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3975357803646873, + "compression/movement_sparsity/importance_threshold": -0.00032174067208752717, + "compression/movement_sparsity/linear_layer_sparsity": 0.8839133033836399, + "compression/movement_sparsity/model_sparsity": 0.8535481621770321, + "compression_loss": 149.86683654785156, + "distillation_loss": 7.251405715942383, + "epoch": 1.59, + "learning_rate": 4.2037193575655116e-05, + "loss": 156.1975, + "step": 1884, + "task_loss": 3.388500452041626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.398169577859541, + "compression/movement_sparsity/importance_threshold": -0.0003197505292582793, + "compression/movement_sparsity/linear_layer_sparsity": 0.8841942486973086, + "compression/movement_sparsity/model_sparsity": 0.8538194561548873, + "compression_loss": 149.9334259033203, + "distillation_loss": 6.155484199523926, + "epoch": 1.59, + "learning_rate": 4.2032967032967036e-05, + "loss": 156.3457, + "step": 1885, + "task_loss": 3.1280503273010254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3988007563602167, + "compression/movement_sparsity/importance_threshold": -0.00031776861014800543, + "compression/movement_sparsity/linear_layer_sparsity": 0.8845195399904149, + "compression/movement_sparsity/model_sparsity": 0.8541335726913624, + "compression_loss": 149.99986267089844, + "distillation_loss": 6.894721508026123, + "epoch": 1.59, + "learning_rate": 4.2028740490278956e-05, + "loss": 155.4189, + "step": 1886, + "task_loss": 3.0255980491638184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3994293212890625, + "compression/movement_sparsity/importance_threshold": -0.0003157948977303704, + "compression/movement_sparsity/linear_layer_sparsity": 0.8848357331436151, + "compression/movement_sparsity/model_sparsity": 0.8544389036370262, + "compression_loss": 150.06594848632812, + "distillation_loss": 7.085342884063721, + "epoch": 1.59, + "learning_rate": 4.202451394759087e-05, + "loss": 156.295, + "step": 1887, + "task_loss": 3.3206844329833984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4000552780684263, + "compression/movement_sparsity/importance_threshold": -0.00031382937497904245, + "compression/movement_sparsity/linear_layer_sparsity": 0.8852122745092204, + "compression/movement_sparsity/model_sparsity": 0.8548025096483461, + "compression_loss": 150.13168334960938, + "distillation_loss": 7.8692169189453125, + "epoch": 1.6, + "learning_rate": 4.202028740490279e-05, + "loss": 156.692, + "step": 1888, + "task_loss": 3.3173069953918457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4006786321206566, + "compression/movement_sparsity/importance_threshold": -0.00031187202486768813, + "compression/movement_sparsity/linear_layer_sparsity": 0.8855141467370898, + "compression/movement_sparsity/model_sparsity": 0.8550940116365208, + "compression_loss": 150.19728088378906, + "distillation_loss": 5.269217014312744, + "epoch": 1.6, + "learning_rate": 4.2016060862214715e-05, + "loss": 156.0347, + "step": 1889, + "task_loss": 3.3313422203063965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4012993888681013, + "compression/movement_sparsity/importance_threshold": -0.00030992283036997397, + "compression/movement_sparsity/linear_layer_sparsity": 0.8858086379051927, + "compression/movement_sparsity/model_sparsity": 0.8553783861270385, + "compression_loss": 150.26242065429688, + "distillation_loss": 7.1392107009887695, + "epoch": 1.6, + "learning_rate": 4.201183431952663e-05, + "loss": 157.0173, + "step": 1890, + "task_loss": 4.142314434051514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4019175537331088, + "compression/movement_sparsity/importance_threshold": -0.00030798177445956563, + "compression/movement_sparsity/linear_layer_sparsity": 0.8862039598348245, + "compression/movement_sparsity/model_sparsity": 0.855760127532235, + "compression_loss": 150.327392578125, + "distillation_loss": 6.651957035064697, + "epoch": 1.6, + "learning_rate": 4.200760777683855e-05, + "loss": 156.3176, + "step": 1891, + "task_loss": 3.3759889602661133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4025331321380274, + "compression/movement_sparsity/importance_threshold": -0.00030604884011013224, + "compression/movement_sparsity/linear_layer_sparsity": 0.8864758189327545, + "compression/movement_sparsity/model_sparsity": 0.8560226474338147, + "compression_loss": 150.3920135498047, + "distillation_loss": 5.683810234069824, + "epoch": 1.6, + "learning_rate": 4.200338123415047e-05, + "loss": 156.3803, + "step": 1892, + "task_loss": 2.293818473815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.403146129505205, + "compression/movement_sparsity/importance_threshold": -0.0003041240102953386, + "compression/movement_sparsity/linear_layer_sparsity": 0.886789090664884, + "compression/movement_sparsity/model_sparsity": 0.8563251573182089, + "compression_loss": 150.45632934570312, + "distillation_loss": 5.966579437255859, + "epoch": 1.6, + "learning_rate": 4.199915469146238e-05, + "loss": 156.61, + "step": 1893, + "task_loss": 3.467686891555786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.40375655125699, + "compression/movement_sparsity/importance_threshold": -0.00030220726798885125, + "compression/movement_sparsity/linear_layer_sparsity": 0.8871445143296062, + "compression/movement_sparsity/model_sparsity": 0.8566683710866367, + "compression_loss": 150.5203399658203, + "distillation_loss": 7.3996686935424805, + "epoch": 1.6, + "learning_rate": 4.199492814877431e-05, + "loss": 156.2879, + "step": 1894, + "task_loss": 4.114287376403809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4043644028157305, + "compression/movement_sparsity/importance_threshold": -0.0003002985961643393, + "compression/movement_sparsity/linear_layer_sparsity": 0.8874655725432017, + "compression/movement_sparsity/model_sparsity": 0.8569783999629048, + "compression_loss": 150.5841522216797, + "distillation_loss": 5.302109718322754, + "epoch": 1.6, + "learning_rate": 4.1990701606086227e-05, + "loss": 156.9972, + "step": 1895, + "task_loss": 2.8223211765289307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4049696896037747, + "compression/movement_sparsity/importance_threshold": -0.0002983979777954667, + "compression/movement_sparsity/linear_layer_sparsity": 0.8877548051533772, + "compression/movement_sparsity/model_sparsity": 0.8572576965431371, + "compression_loss": 150.64776611328125, + "distillation_loss": 5.442483901977539, + "epoch": 1.6, + "learning_rate": 4.198647506339814e-05, + "loss": 156.8532, + "step": 1896, + "task_loss": 3.177159070968628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.405572417043471, + "compression/movement_sparsity/importance_threshold": -0.00029650539585590087, + "compression/movement_sparsity/linear_layer_sparsity": 0.8880860227577986, + "compression/movement_sparsity/model_sparsity": 0.8575775358039021, + "compression_loss": 150.71109008789062, + "distillation_loss": 6.852133750915527, + "epoch": 1.6, + "learning_rate": 4.198224852071006e-05, + "loss": 156.6172, + "step": 1897, + "task_loss": 3.7614424228668213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4061725905571674, + "compression/movement_sparsity/importance_threshold": -0.00029462083331930917, + "compression/movement_sparsity/linear_layer_sparsity": 0.8883290492183851, + "compression/movement_sparsity/model_sparsity": 0.8578122135579307, + "compression_loss": 150.77406311035156, + "distillation_loss": 7.884014129638672, + "epoch": 1.6, + "learning_rate": 4.197802197802198e-05, + "loss": 157.232, + "step": 1898, + "task_loss": 3.245591163635254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4067702155672117, + "compression/movement_sparsity/importance_threshold": -0.0002927442731593599, + "compression/movement_sparsity/linear_layer_sparsity": 0.8886465898025282, + "compression/movement_sparsity/model_sparsity": 0.8581188456461393, + "compression_loss": 150.8367462158203, + "distillation_loss": 6.228991508483887, + "epoch": 1.6, + "learning_rate": 4.19737954353339e-05, + "loss": 157.7381, + "step": 1899, + "task_loss": 2.4144697189331055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4073652974959527, + "compression/movement_sparsity/importance_threshold": -0.00029087569834971606, + "compression/movement_sparsity/linear_layer_sparsity": 0.8889545195075567, + "compression/movement_sparsity/model_sparsity": 0.8584161970184975, + "compression_loss": 150.89923095703125, + "distillation_loss": 5.907983779907227, + "epoch": 1.61, + "learning_rate": 4.196956889264582e-05, + "loss": 156.7937, + "step": 1900, + "task_loss": 3.688079357147217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4079578417657388, + "compression/movement_sparsity/importance_threshold": -0.00028901509186404686, + "compression/movement_sparsity/linear_layer_sparsity": 0.8893004157623378, + "compression/movement_sparsity/model_sparsity": 0.8587502106728255, + "compression_loss": 150.96142578125, + "distillation_loss": 5.798238277435303, + "epoch": 1.61, + "learning_rate": 4.196534234995774e-05, + "loss": 157.1843, + "step": 1901, + "task_loss": 3.2603585720062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4085478537989173, + "compression/movement_sparsity/importance_threshold": -0.0002871624366760179, + "compression/movement_sparsity/linear_layer_sparsity": 0.8897274321295456, + "compression/movement_sparsity/model_sparsity": 0.8591625577141638, + "compression_loss": 151.02337646484375, + "distillation_loss": 8.634401321411133, + "epoch": 1.61, + "learning_rate": 4.196111580726966e-05, + "loss": 158.0282, + "step": 1902, + "task_loss": 3.356994390487671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4091353390178372, + "compression/movement_sparsity/importance_threshold": -0.00028531771575929753, + "compression/movement_sparsity/linear_layer_sparsity": 0.8899948077404446, + "compression/movement_sparsity/model_sparsity": 0.8594207481502848, + "compression_loss": 151.08494567871094, + "distillation_loss": 6.6648406982421875, + "epoch": 1.61, + "learning_rate": 4.195688926458157e-05, + "loss": 156.7914, + "step": 1903, + "task_loss": 3.902116298675537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4097203028448462, + "compression/movement_sparsity/importance_threshold": -0.00028348091208755047, + "compression/movement_sparsity/linear_layer_sparsity": 0.8903204090619096, + "compression/movement_sparsity/model_sparsity": 0.8597351640646905, + "compression_loss": 151.14620971679688, + "distillation_loss": 7.1224775314331055, + "epoch": 1.61, + "learning_rate": 4.195266272189349e-05, + "loss": 157.1437, + "step": 1904, + "task_loss": 3.330153226852417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.410302750702293, + "compression/movement_sparsity/importance_threshold": -0.000281652008634445, + "compression/movement_sparsity/linear_layer_sparsity": 0.8906827487438604, + "compression/movement_sparsity/model_sparsity": 0.8600850562638792, + "compression_loss": 151.20745849609375, + "distillation_loss": 5.975131988525391, + "epoch": 1.61, + "learning_rate": 4.194843617920541e-05, + "loss": 157.2221, + "step": 1905, + "task_loss": 3.449862003326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4108826880125251, + "compression/movement_sparsity/importance_threshold": -0.0002798309883736468, + "compression/movement_sparsity/linear_layer_sparsity": 0.8909932182965955, + "compression/movement_sparsity/model_sparsity": 0.8603848602323617, + "compression_loss": 151.2682647705078, + "distillation_loss": 6.480030059814453, + "epoch": 1.61, + "learning_rate": 4.194420963651733e-05, + "loss": 157.373, + "step": 1906, + "task_loss": 3.391955852508545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4114601201978914, + "compression/movement_sparsity/importance_threshold": -0.0002780178342788233, + "compression/movement_sparsity/linear_layer_sparsity": 0.8912539163736183, + "compression/movement_sparsity/model_sparsity": 0.8606366025284378, + "compression_loss": 151.32887268066406, + "distillation_loss": 8.780585289001465, + "epoch": 1.61, + "learning_rate": 4.193998309382925e-05, + "loss": 157.9215, + "step": 1907, + "task_loss": 3.4508187770843506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4120350526807395, + "compression/movement_sparsity/importance_threshold": -0.000276212529323641, + "compression/movement_sparsity/linear_layer_sparsity": 0.8915470362624431, + "compression/movement_sparsity/model_sparsity": 0.8609196528473392, + "compression_loss": 151.38922119140625, + "distillation_loss": 8.277044296264648, + "epoch": 1.61, + "learning_rate": 4.193575655114117e-05, + "loss": 157.7565, + "step": 1908, + "task_loss": 3.0446062088012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.412607490883418, + "compression/movement_sparsity/importance_threshold": -0.0002744150564817664, + "compression/movement_sparsity/linear_layer_sparsity": 0.8918629074631171, + "compression/movement_sparsity/model_sparsity": 0.8612246729005365, + "compression_loss": 151.4491729736328, + "distillation_loss": 6.879632949829102, + "epoch": 1.61, + "learning_rate": 4.193153000845308e-05, + "loss": 157.7787, + "step": 1909, + "task_loss": 2.9869284629821777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.413177440228275, + "compression/movement_sparsity/importance_threshold": -0.00027262539872686606, + "compression/movement_sparsity/linear_layer_sparsity": 0.8921619894357599, + "compression/movement_sparsity/model_sparsity": 0.8615134804873352, + "compression_loss": 151.50900268554688, + "distillation_loss": 6.028229713439941, + "epoch": 1.61, + "learning_rate": 4.1927303465765e-05, + "loss": 157.0191, + "step": 1910, + "task_loss": 2.8621063232421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4137449061376586, + "compression/movement_sparsity/importance_threshold": -0.0002708435390326074, + "compression/movement_sparsity/linear_layer_sparsity": 0.892503068326816, + "compression/movement_sparsity/model_sparsity": 0.8618428422692023, + "compression_loss": 151.56849670410156, + "distillation_loss": 7.064677715301514, + "epoch": 1.61, + "learning_rate": 4.192307692307693e-05, + "loss": 158.5267, + "step": 1911, + "task_loss": 3.214648723602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4143098940339172, + "compression/movement_sparsity/importance_threshold": -0.00026906946037265605, + "compression/movement_sparsity/linear_layer_sparsity": 0.8926854484708066, + "compression/movement_sparsity/model_sparsity": 0.8620189570941797, + "compression_loss": 151.6277313232422, + "distillation_loss": 6.206345081329346, + "epoch": 1.62, + "learning_rate": 4.191885038038885e-05, + "loss": 157.3912, + "step": 1912, + "task_loss": 3.416571617126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4148724093393985, + "compression/movement_sparsity/importance_threshold": -0.0002673031457206803, + "compression/movement_sparsity/linear_layer_sparsity": 0.8929668707511806, + "compression/movement_sparsity/model_sparsity": 0.8622907116534667, + "compression_loss": 151.686767578125, + "distillation_loss": 6.52351188659668, + "epoch": 1.62, + "learning_rate": 4.191462383770076e-05, + "loss": 158.6139, + "step": 1913, + "task_loss": 3.0527193546295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4154324574764514, + "compression/movement_sparsity/importance_threshold": -0.0002655445780503458, + "compression/movement_sparsity/linear_layer_sparsity": 0.8932720936701558, + "compression/movement_sparsity/model_sparsity": 0.8625854492261996, + "compression_loss": 151.7454376220703, + "distillation_loss": 5.9349365234375, + "epoch": 1.62, + "learning_rate": 4.191039729501268e-05, + "loss": 157.3392, + "step": 1914, + "task_loss": 3.1626975536346436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4159900438674236, + "compression/movement_sparsity/importance_threshold": -0.0002637937403353191, + "compression/movement_sparsity/linear_layer_sparsity": 0.893549151705175, + "compression/movement_sparsity/model_sparsity": 0.8628529894653858, + "compression_loss": 151.8039093017578, + "distillation_loss": 6.987100124359131, + "epoch": 1.62, + "learning_rate": 4.19061707523246e-05, + "loss": 158.3103, + "step": 1915, + "task_loss": 3.1438724994659424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4165451739346633, + "compression/movement_sparsity/importance_threshold": -0.00026205061554926846, + "compression/movement_sparsity/linear_layer_sparsity": 0.8937929413124903, + "compression/movement_sparsity/model_sparsity": 0.8630884041497051, + "compression_loss": 151.862060546875, + "distillation_loss": 8.266741752624512, + "epoch": 1.62, + "learning_rate": 4.190194420963652e-05, + "loss": 158.4448, + "step": 1916, + "task_loss": 4.059313774108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.417097853100519, + "compression/movement_sparsity/importance_threshold": -0.00026031518666585866, + "compression/movement_sparsity/linear_layer_sparsity": 0.8941232526801713, + "compression/movement_sparsity/model_sparsity": 0.8634073683057497, + "compression_loss": 151.92002868652344, + "distillation_loss": 7.328000068664551, + "epoch": 1.62, + "learning_rate": 4.189771766694844e-05, + "loss": 158.5577, + "step": 1917, + "task_loss": 3.6128668785095215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4176480867873387, + "compression/movement_sparsity/importance_threshold": -0.0002585874366587571, + "compression/movement_sparsity/linear_layer_sparsity": 0.8943218258438114, + "compression/movement_sparsity/model_sparsity": 0.8635991198703362, + "compression_loss": 151.97763061523438, + "distillation_loss": 7.261575698852539, + "epoch": 1.62, + "learning_rate": 4.189349112426036e-05, + "loss": 158.444, + "step": 1918, + "task_loss": 3.3322975635528564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4181958804174704, + "compression/movement_sparsity/importance_threshold": -0.00025686734850163036, + "compression/movement_sparsity/linear_layer_sparsity": 0.8946176763670247, + "compression/movement_sparsity/model_sparsity": 0.8638848070179346, + "compression_loss": 152.03500366210938, + "distillation_loss": 6.789047718048096, + "epoch": 1.62, + "learning_rate": 4.188926458157227e-05, + "loss": 158.5068, + "step": 1919, + "task_loss": 2.7582054138183594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4187412394132628, + "compression/movement_sparsity/importance_threshold": -0.0002551549051681458, + "compression/movement_sparsity/linear_layer_sparsity": 0.8949093772799008, + "compression/movement_sparsity/model_sparsity": 0.8641664871070764, + "compression_loss": 152.0921630859375, + "distillation_loss": 6.755656719207764, + "epoch": 1.62, + "learning_rate": 4.188503803888419e-05, + "loss": 158.2574, + "step": 1920, + "task_loss": 2.849137783050537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4192841691970637, + "compression/movement_sparsity/importance_threshold": -0.00025345008963196993, + "compression/movement_sparsity/linear_layer_sparsity": 0.8950742527458019, + "compression/movement_sparsity/model_sparsity": 0.8643256985935075, + "compression_loss": 152.14906311035156, + "distillation_loss": 5.173419952392578, + "epoch": 1.62, + "learning_rate": 4.188081149619611e-05, + "loss": 158.2758, + "step": 1921, + "task_loss": 3.542970895767212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4198246751912214, + "compression/movement_sparsity/importance_threshold": -0.00025175288486676845, + "compression/movement_sparsity/linear_layer_sparsity": 0.8953493313689936, + "compression/movement_sparsity/model_sparsity": 0.8645913274197518, + "compression_loss": 152.20555114746094, + "distillation_loss": 5.921302795410156, + "epoch": 1.62, + "learning_rate": 4.187658495350803e-05, + "loss": 158.864, + "step": 1922, + "task_loss": 2.270059823989868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4203627628180842, + "compression/movement_sparsity/importance_threshold": -0.0002500632738462096, + "compression/movement_sparsity/linear_layer_sparsity": 0.8955927990237827, + "compression/movement_sparsity/model_sparsity": 0.8648264312116047, + "compression_loss": 152.26182556152344, + "distillation_loss": 5.613929748535156, + "epoch": 1.63, + "learning_rate": 4.187235841081995e-05, + "loss": 158.1929, + "step": 1923, + "task_loss": 2.4784929752349854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4208984375, + "compression/movement_sparsity/importance_threshold": -0.00024838123954395996, + "compression/movement_sparsity/linear_layer_sparsity": 0.8958665302160316, + "compression/movement_sparsity/model_sparsity": 0.8650907588953042, + "compression_loss": 152.31796264648438, + "distillation_loss": 7.734485626220703, + "epoch": 1.63, + "learning_rate": 4.186813186813187e-05, + "loss": 159.5032, + "step": 1924, + "task_loss": 3.4364078044891357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4214317046593172, + "compression/movement_sparsity/importance_threshold": -0.00024670676493368515, + "compression/movement_sparsity/linear_layer_sparsity": 0.8960579250307548, + "compression/movement_sparsity/model_sparsity": 0.8652755787093424, + "compression_loss": 152.37364196777344, + "distillation_loss": 6.083627700805664, + "epoch": 1.63, + "learning_rate": 4.1863905325443785e-05, + "loss": 158.3182, + "step": 1925, + "task_loss": 3.2751035690307617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4219625697183842, + "compression/movement_sparsity/importance_threshold": -0.0002450398329890526, + "compression/movement_sparsity/linear_layer_sparsity": 0.896364555001511, + "compression/movement_sparsity/model_sparsity": 0.8655716749972989, + "compression_loss": 152.42921447753906, + "distillation_loss": 6.687776565551758, + "epoch": 1.63, + "learning_rate": 4.1859678782755705e-05, + "loss": 158.7509, + "step": 1926, + "task_loss": 2.933615207672119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4224910380995488, + "compression/movement_sparsity/importance_threshold": -0.00024338042668372879, + "compression/movement_sparsity/linear_layer_sparsity": 0.8966406591031194, + "compression/movement_sparsity/model_sparsity": 0.8658382940736216, + "compression_loss": 152.4844970703125, + "distillation_loss": 5.581253528594971, + "epoch": 1.63, + "learning_rate": 4.1855452240067624e-05, + "loss": 158.3622, + "step": 1927, + "task_loss": 2.1447255611419678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4230171152251594, + "compression/movement_sparsity/importance_threshold": -0.0002417285289913803, + "compression/movement_sparsity/linear_layer_sparsity": 0.8968729418886661, + "compression/movement_sparsity/model_sparsity": 0.8660625972308992, + "compression_loss": 152.53945922851562, + "distillation_loss": 5.564861297607422, + "epoch": 1.63, + "learning_rate": 4.185122569737955e-05, + "loss": 158.6945, + "step": 1928, + "task_loss": 4.0267014503479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4235408065175643, + "compression/movement_sparsity/importance_threshold": -0.0002400841228856745, + "compression/movement_sparsity/linear_layer_sparsity": 0.8972512122585785, + "compression/movement_sparsity/model_sparsity": 0.8664278728499094, + "compression_loss": 152.59423828125, + "distillation_loss": 7.034060955047607, + "epoch": 1.63, + "learning_rate": 4.1846999154691464e-05, + "loss": 160.2622, + "step": 1929, + "task_loss": 2.7290186882019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4240621173991113, + "compression/movement_sparsity/importance_threshold": -0.00023844719134027796, + "compression/movement_sparsity/linear_layer_sparsity": 0.8975155949034009, + "compression/movement_sparsity/model_sparsity": 0.866683173137546, + "compression_loss": 152.64881896972656, + "distillation_loss": 6.278230667114258, + "epoch": 1.63, + "learning_rate": 4.1842772612003383e-05, + "loss": 158.2031, + "step": 1930, + "task_loss": 3.3503847122192383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.424581053292149, + "compression/movement_sparsity/importance_threshold": -0.00023681771732885631, + "compression/movement_sparsity/linear_layer_sparsity": 0.8977801683349054, + "compression/movement_sparsity/model_sparsity": 0.8669386576577552, + "compression_loss": 152.7030029296875, + "distillation_loss": 5.02880334854126, + "epoch": 1.63, + "learning_rate": 4.18385460693153e-05, + "loss": 158.8585, + "step": 1931, + "task_loss": 3.569124698638916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4250976196190255, + "compression/movement_sparsity/importance_threshold": -0.00023519568382507784, + "compression/movement_sparsity/linear_layer_sparsity": 0.8980975062081986, + "compression/movement_sparsity/model_sparsity": 0.8672450939988554, + "compression_loss": 152.757080078125, + "distillation_loss": 8.632002830505371, + "epoch": 1.63, + "learning_rate": 4.1834319526627216e-05, + "loss": 159.2811, + "step": 1932, + "task_loss": 4.659265041351318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.425611821802089, + "compression/movement_sparsity/importance_threshold": -0.0002335810738026082, + "compression/movement_sparsity/linear_layer_sparsity": 0.8982784435279052, + "compression/movement_sparsity/model_sparsity": 0.8674198155650017, + "compression_loss": 152.81076049804688, + "distillation_loss": 8.428467750549316, + "epoch": 1.63, + "learning_rate": 4.183009298393914e-05, + "loss": 159.3197, + "step": 1933, + "task_loss": 4.047605991363525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4261236652636875, + "compression/movement_sparsity/importance_threshold": -0.00023197387023511477, + "compression/movement_sparsity/linear_layer_sparsity": 0.8985142797154073, + "compression/movement_sparsity/model_sparsity": 0.8676475500539462, + "compression_loss": 152.86431884765625, + "distillation_loss": 8.082972526550293, + "epoch": 1.63, + "learning_rate": 4.182586644125106e-05, + "loss": 159.943, + "step": 1934, + "task_loss": 3.781964063644409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4266331554261695, + "compression/movement_sparsity/importance_threshold": -0.00023037405609626325, + "compression/movement_sparsity/linear_layer_sparsity": 0.8987905030586921, + "compression/movement_sparsity/model_sparsity": 0.8679142842756267, + "compression_loss": 152.91769409179688, + "distillation_loss": 7.441792964935303, + "epoch": 1.64, + "learning_rate": 4.1821639898562975e-05, + "loss": 160.1229, + "step": 1935, + "task_loss": 3.8719069957733154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4271402977118828, + "compression/movement_sparsity/importance_threshold": -0.0002287816143597219, + "compression/movement_sparsity/linear_layer_sparsity": 0.8989635942760943, + "compression/movement_sparsity/model_sparsity": 0.8680814292772203, + "compression_loss": 152.97080993652344, + "distillation_loss": 6.845407962799072, + "epoch": 1.64, + "learning_rate": 4.1817413355874895e-05, + "loss": 159.2656, + "step": 1936, + "task_loss": 3.4963345527648926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.427645097543176, + "compression/movement_sparsity/importance_threshold": -0.00022719652799915637, + "compression/movement_sparsity/linear_layer_sparsity": 0.8991851453107687, + "compression/movement_sparsity/model_sparsity": 0.8682953693522828, + "compression_loss": 153.02357482910156, + "distillation_loss": 7.393518447875977, + "epoch": 1.64, + "learning_rate": 4.1813186813186815e-05, + "loss": 159.0549, + "step": 1937, + "task_loss": 3.1124753952026367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.428147560342397, + "compression/movement_sparsity/importance_threshold": -0.00022561877998823407, + "compression/movement_sparsity/linear_layer_sparsity": 0.8994629903408519, + "compression/movement_sparsity/model_sparsity": 0.8685636695508314, + "compression_loss": 153.07615661621094, + "distillation_loss": 7.813177585601807, + "epoch": 1.64, + "learning_rate": 4.1808960270498735e-05, + "loss": 159.6019, + "step": 1938, + "task_loss": 2.669848918914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4286476915318942, + "compression/movement_sparsity/importance_threshold": -0.00022404835330062066, + "compression/movement_sparsity/linear_layer_sparsity": 0.8996588447942709, + "compression/movement_sparsity/model_sparsity": 0.8687527958012567, + "compression_loss": 153.12831115722656, + "distillation_loss": 6.959723949432373, + "epoch": 1.64, + "learning_rate": 4.1804733727810654e-05, + "loss": 160.1599, + "step": 1939, + "task_loss": 2.681326150894165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4291454965340158, + "compression/movement_sparsity/importance_threshold": -0.00022248523090998354, + "compression/movement_sparsity/linear_layer_sparsity": 0.899899069075463, + "compression/movement_sparsity/model_sparsity": 0.8689847676393736, + "compression_loss": 153.18040466308594, + "distillation_loss": 6.1097941398620605, + "epoch": 1.64, + "learning_rate": 4.1800507185122574e-05, + "loss": 159.2079, + "step": 1940, + "task_loss": 3.562547445297241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.42964098077111, + "compression/movement_sparsity/importance_threshold": -0.00022092939578998924, + "compression/movement_sparsity/linear_layer_sparsity": 0.9002211169949724, + "compression/movement_sparsity/model_sparsity": 0.8692957522221125, + "compression_loss": 153.23208618164062, + "distillation_loss": 5.258612632751465, + "epoch": 1.64, + "learning_rate": 4.1796280642434494e-05, + "loss": 159.4402, + "step": 1941, + "task_loss": 2.9537127017974854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4301341496655244, + "compression/movement_sparsity/importance_threshold": -0.00021938083091430517, + "compression/movement_sparsity/linear_layer_sparsity": 0.9004457802373996, + "compression/movement_sparsity/model_sparsity": 0.8695126975910175, + "compression_loss": 153.28353881835938, + "distillation_loss": 5.468062877655029, + "epoch": 1.64, + "learning_rate": 4.179205409974641e-05, + "loss": 159.5712, + "step": 1942, + "task_loss": 2.7087302207946777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4306250086396082, + "compression/movement_sparsity/importance_threshold": -0.00021783951925659698, + "compression/movement_sparsity/linear_layer_sparsity": 0.900659604411446, + "compression/movement_sparsity/model_sparsity": 0.8697191762468852, + "compression_loss": 153.334716796875, + "distillation_loss": 7.427664756774902, + "epoch": 1.64, + "learning_rate": 4.1787827557058327e-05, + "loss": 159.6785, + "step": 1943, + "task_loss": 3.4780051708221436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4311135631157088, + "compression/movement_sparsity/importance_threshold": -0.00021630544379053207, + "compression/movement_sparsity/linear_layer_sparsity": 0.9010169240188223, + "compression/movement_sparsity/model_sparsity": 0.8700642208265044, + "compression_loss": 153.3856201171875, + "distillation_loss": 6.463655948638916, + "epoch": 1.64, + "learning_rate": 4.1783601014370246e-05, + "loss": 159.6995, + "step": 1944, + "task_loss": 2.984783887863159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.431599818516175, + "compression/movement_sparsity/importance_threshold": -0.00021477858748977698, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014293213565086, + "compression/movement_sparsity/model_sparsity": 0.8704624510469586, + "compression_loss": 153.43626403808594, + "distillation_loss": 8.109355926513672, + "epoch": 1.64, + "learning_rate": 4.1779374471682166e-05, + "loss": 159.4469, + "step": 1945, + "task_loss": 4.172676086425781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4320837802633544, + "compression/movement_sparsity/importance_threshold": -0.00021325893332799824, + "compression/movement_sparsity/linear_layer_sparsity": 0.9016254023691127, + "compression/movement_sparsity/model_sparsity": 0.870651796073564, + "compression_loss": 153.4867706298828, + "distillation_loss": 7.0241312980651855, + "epoch": 1.64, + "learning_rate": 4.1775147928994086e-05, + "loss": 159.8084, + "step": 1946, + "task_loss": 4.404529571533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4325654537795955, + "compression/movement_sparsity/importance_threshold": -0.00021174646427886325, + "compression/movement_sparsity/linear_layer_sparsity": 0.9018634564517951, + "compression/movement_sparsity/model_sparsity": 0.8708816722661662, + "compression_loss": 153.5367431640625, + "distillation_loss": 6.621567249298096, + "epoch": 1.65, + "learning_rate": 4.1770921386306005e-05, + "loss": 159.4945, + "step": 1947, + "task_loss": 3.0116825103759766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4330448444872466, + "compression/movement_sparsity/importance_threshold": -0.00021024116331603854, + "compression/movement_sparsity/linear_layer_sparsity": 0.9020940340813698, + "compression/movement_sparsity/model_sparsity": 0.8711043288448252, + "compression_loss": 153.58668518066406, + "distillation_loss": 7.074351787567139, + "epoch": 1.65, + "learning_rate": 4.176669484361792e-05, + "loss": 160.0091, + "step": 1948, + "task_loss": 4.423213005065918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4335219578086558, + "compression/movement_sparsity/importance_threshold": -0.00020874301341318978, + "compression/movement_sparsity/linear_layer_sparsity": 0.9024408246487235, + "compression/movement_sparsity/model_sparsity": 0.8714392060893379, + "compression_loss": 153.63632202148438, + "distillation_loss": 8.042926788330078, + "epoch": 1.65, + "learning_rate": 4.176246830092984e-05, + "loss": 160.5751, + "step": 1949, + "task_loss": 3.651385545730591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.433996799166171, + "compression/movement_sparsity/importance_threshold": -0.00020725199754398523, + "compression/movement_sparsity/linear_layer_sparsity": 0.9026384200306174, + "compression/movement_sparsity/model_sparsity": 0.8716300134619892, + "compression_loss": 153.68568420410156, + "distillation_loss": 9.074098587036133, + "epoch": 1.65, + "learning_rate": 4.1758241758241765e-05, + "loss": 160.7414, + "step": 1950, + "task_loss": 3.487760066986084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.434469373982141, + "compression/movement_sparsity/importance_threshold": -0.00020576809868208968, + "compression/movement_sparsity/linear_layer_sparsity": 0.9027966298868101, + "compression/movement_sparsity/model_sparsity": 0.8717827883229112, + "compression_loss": 153.7347869873047, + "distillation_loss": 6.59572696685791, + "epoch": 1.65, + "learning_rate": 4.175401521555368e-05, + "loss": 160.0203, + "step": 1951, + "task_loss": 2.638840913772583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4349396876789133, + "compression/movement_sparsity/importance_threshold": -0.00020429129980117228, + "compression/movement_sparsity/linear_layer_sparsity": 0.9029612549451907, + "compression/movement_sparsity/model_sparsity": 0.8719417580040907, + "compression_loss": 153.78375244140625, + "distillation_loss": 7.987120151519775, + "epoch": 1.65, + "learning_rate": 4.17497886728656e-05, + "loss": 159.8054, + "step": 1952, + "task_loss": 2.8663811683654785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4354077456788368, + "compression/movement_sparsity/importance_threshold": -0.00020282158387489781, + "compression/movement_sparsity/linear_layer_sparsity": 0.9032176603218648, + "compression/movement_sparsity/model_sparsity": 0.8721893550672807, + "compression_loss": 153.83242797851562, + "distillation_loss": 7.214272975921631, + "epoch": 1.65, + "learning_rate": 4.174556213017752e-05, + "loss": 160.1428, + "step": 1953, + "task_loss": 3.519016742706299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.435873553404259, + "compression/movement_sparsity/importance_threshold": -0.00020135893387693456, + "compression/movement_sparsity/linear_layer_sparsity": 0.9034533057226847, + "compression/movement_sparsity/model_sparsity": 0.8724169053236523, + "compression_loss": 153.8809356689453, + "distillation_loss": 6.956715106964111, + "epoch": 1.65, + "learning_rate": 4.174133558748943e-05, + "loss": 160.8754, + "step": 1954, + "task_loss": 3.084801435470581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.436337116277529, + "compression/movement_sparsity/importance_threshold": -0.0001999033327809473, + "compression/movement_sparsity/linear_layer_sparsity": 0.9036167502884694, + "compression/movement_sparsity/model_sparsity": 0.8725747350657881, + "compression_loss": 153.92904663085938, + "distillation_loss": 5.978443145751953, + "epoch": 1.65, + "learning_rate": 4.173710904480136e-05, + "loss": 160.0217, + "step": 1955, + "task_loss": 3.376854419708252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4367984397209939, + "compression/movement_sparsity/importance_threshold": -0.00019845476356060432, + "compression/movement_sparsity/linear_layer_sparsity": 0.9039561001752184, + "compression/movement_sparsity/model_sparsity": 0.8729024272399649, + "compression_loss": 153.97708129882812, + "distillation_loss": 8.853143692016602, + "epoch": 1.65, + "learning_rate": 4.1732882502113276e-05, + "loss": 160.8888, + "step": 1956, + "task_loss": 4.122030735015869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4372575291570024, + "compression/movement_sparsity/importance_threshold": -0.00019701320918957214, + "compression/movement_sparsity/linear_layer_sparsity": 0.9041730007845148, + "compression/movement_sparsity/model_sparsity": 0.8731118766460677, + "compression_loss": 154.0248260498047, + "distillation_loss": 6.055191993713379, + "epoch": 1.65, + "learning_rate": 4.1728655959425196e-05, + "loss": 160.1562, + "step": 1957, + "task_loss": 2.702770948410034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.437714390007903, + "compression/movement_sparsity/importance_threshold": -0.00019557865264151642, + "compression/movement_sparsity/linear_layer_sparsity": 0.9044220012530869, + "compression/movement_sparsity/model_sparsity": 0.8733523231825293, + "compression_loss": 154.07228088378906, + "distillation_loss": 6.207364082336426, + "epoch": 1.65, + "learning_rate": 4.172442941673711e-05, + "loss": 160.834, + "step": 1958, + "task_loss": 3.1016321182250977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4381690276960435, + "compression/movement_sparsity/importance_threshold": -0.00019415107689010544, + "compression/movement_sparsity/linear_layer_sparsity": 0.9046773573030089, + "compression/movement_sparsity/model_sparsity": 0.8735989069665694, + "compression_loss": 154.11944580078125, + "distillation_loss": 7.013033390045166, + "epoch": 1.66, + "learning_rate": 4.172020287404903e-05, + "loss": 160.567, + "step": 1959, + "task_loss": 2.974773406982422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4386214476437722, + "compression/movement_sparsity/importance_threshold": -0.00019273046490900485, + "compression/movement_sparsity/linear_layer_sparsity": 0.9049495860501356, + "compression/movement_sparsity/model_sparsity": 0.8738617838187587, + "compression_loss": 154.16651916503906, + "distillation_loss": 7.334329128265381, + "epoch": 1.66, + "learning_rate": 4.171597633136095e-05, + "loss": 160.7697, + "step": 1960, + "task_loss": 3.0747008323669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4390716552734375, + "compression/movement_sparsity/importance_threshold": -0.0001913167996718812, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051572931261848, + "compression/movement_sparsity/model_sparsity": 0.8740623555177638, + "compression_loss": 154.21322631835938, + "distillation_loss": 5.816250801086426, + "epoch": 1.66, + "learning_rate": 4.171174978867287e-05, + "loss": 160.3924, + "step": 1961, + "task_loss": 2.6259987354278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4395196560073873, + "compression/movement_sparsity/importance_threshold": -0.00018991006415240187, + "compression/movement_sparsity/linear_layer_sparsity": 0.9053880853907769, + "compression/movement_sparsity/model_sparsity": 0.8742852193580671, + "compression_loss": 154.2598114013672, + "distillation_loss": 6.2397942543029785, + "epoch": 1.66, + "learning_rate": 4.170752324598479e-05, + "loss": 160.3997, + "step": 1962, + "task_loss": 2.1425631046295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.43996545526797, + "compression/movement_sparsity/importance_threshold": -0.0001885102413242334, + "compression/movement_sparsity/linear_layer_sparsity": 0.9055441012001245, + "compression/movement_sparsity/model_sparsity": 0.8744358755444029, + "compression_loss": 154.30604553222656, + "distillation_loss": 6.018580436706543, + "epoch": 1.66, + "learning_rate": 4.170329670329671e-05, + "loss": 159.8971, + "step": 1963, + "task_loss": 2.2636215686798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4404090584775335, + "compression/movement_sparsity/importance_threshold": -0.0001871173141610432, + "compression/movement_sparsity/linear_layer_sparsity": 0.9056863803683557, + "compression/movement_sparsity/model_sparsity": 0.8745732669855034, + "compression_loss": 154.35211181640625, + "distillation_loss": 6.947786331176758, + "epoch": 1.66, + "learning_rate": 4.169907016060862e-05, + "loss": 160.8568, + "step": 1964, + "task_loss": 3.9039864540100098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4408504710584262, + "compression/movement_sparsity/importance_threshold": -0.00018573126563649692, + "compression/movement_sparsity/linear_layer_sparsity": 0.9058814716750461, + "compression/movement_sparsity/model_sparsity": 0.8747616563056378, + "compression_loss": 154.39797973632812, + "distillation_loss": 8.172338485717773, + "epoch": 1.66, + "learning_rate": 4.169484361792054e-05, + "loss": 161.6974, + "step": 1965, + "task_loss": 4.233997344970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4412896984329964, + "compression/movement_sparsity/importance_threshold": -0.0001843520787242611, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061074465759132, + "compression/movement_sparsity/model_sparsity": 0.8749798682734802, + "compression_loss": 154.44369506835938, + "distillation_loss": 6.936229705810547, + "epoch": 1.66, + "learning_rate": 4.169061707523246e-05, + "loss": 161.4053, + "step": 1966, + "task_loss": 3.916936159133911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4417267460235919, + "compression/movement_sparsity/importance_threshold": -0.000182979736398004, + "compression/movement_sparsity/linear_layer_sparsity": 0.9062437636603264, + "compression/movement_sparsity/model_sparsity": 0.8751115024466835, + "compression_loss": 154.4892578125, + "distillation_loss": 8.030532836914062, + "epoch": 1.66, + "learning_rate": 4.168639053254438e-05, + "loss": 161.321, + "step": 1967, + "task_loss": 4.089968204498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4421616192525615, + "compression/movement_sparsity/importance_threshold": -0.0001816142216313904, + "compression/movement_sparsity/linear_layer_sparsity": 0.9065616261969957, + "compression/movement_sparsity/model_sparsity": 0.8754184454273585, + "compression_loss": 154.53453063964844, + "distillation_loss": 5.467465877532959, + "epoch": 1.66, + "learning_rate": 4.16821639898563e-05, + "loss": 161.0242, + "step": 1968, + "task_loss": 2.6575448513031006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.442594323542253, + "compression/movement_sparsity/importance_threshold": -0.0001802555173980886, + "compression/movement_sparsity/linear_layer_sparsity": 0.9066594043716097, + "compression/movement_sparsity/model_sparsity": 0.8755128646208737, + "compression_loss": 154.5795440673828, + "distillation_loss": 7.316758155822754, + "epoch": 1.66, + "learning_rate": 4.167793744716822e-05, + "loss": 160.6143, + "step": 1969, + "task_loss": 3.519580125808716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4430248643150148, + "compression/movement_sparsity/importance_threshold": -0.00017890360667176426, + "compression/movement_sparsity/linear_layer_sparsity": 0.9069811422627606, + "compression/movement_sparsity/model_sparsity": 0.875823549825682, + "compression_loss": 154.6243133544922, + "distillation_loss": 6.348191261291504, + "epoch": 1.66, + "learning_rate": 4.167371090448014e-05, + "loss": 160.7449, + "step": 1970, + "task_loss": 3.5273609161376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4434532469931947, + "compression/movement_sparsity/importance_threshold": -0.00017755847242608477, + "compression/movement_sparsity/linear_layer_sparsity": 0.9072157860334991, + "compression/movement_sparsity/model_sparsity": 0.8760501328610469, + "compression_loss": 154.6687469482422, + "distillation_loss": 5.233705043792725, + "epoch": 1.67, + "learning_rate": 4.166948436179205e-05, + "loss": 160.5417, + "step": 1971, + "task_loss": 2.6867949962615967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4438794769991412, + "compression/movement_sparsity/importance_threshold": -0.00017622009763471665, + "compression/movement_sparsity/linear_layer_sparsity": 0.9072905982612465, + "compression/movement_sparsity/model_sparsity": 0.8761223750586219, + "compression_loss": 154.7130126953125, + "distillation_loss": 5.704310894012451, + "epoch": 1.67, + "learning_rate": 4.166525781910398e-05, + "loss": 161.2749, + "step": 1972, + "task_loss": 2.7655320167541504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4443035597552023, + "compression/movement_sparsity/importance_threshold": -0.00017488846527132645, + "compression/movement_sparsity/linear_layer_sparsity": 0.9074928321443507, + "compression/movement_sparsity/model_sparsity": 0.8763176615856973, + "compression_loss": 154.75701904296875, + "distillation_loss": 5.427280426025391, + "epoch": 1.67, + "learning_rate": 4.16610312764159e-05, + "loss": 161.2508, + "step": 1973, + "task_loss": 4.193160057067871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4447255006837267, + "compression/movement_sparsity/importance_threshold": -0.0001735635583095807, + "compression/movement_sparsity/linear_layer_sparsity": 0.9076441498316499, + "compression/movement_sparsity/model_sparsity": 0.8764637810449301, + "compression_loss": 154.80084228515625, + "distillation_loss": 8.274840354919434, + "epoch": 1.67, + "learning_rate": 4.165680473372781e-05, + "loss": 162.25, + "step": 1974, + "task_loss": 3.675563097000122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4451453052070622, + "compression/movement_sparsity/importance_threshold": -0.0001722453597231468, + "compression/movement_sparsity/linear_layer_sparsity": 0.9078646038429017, + "compression/movement_sparsity/model_sparsity": 0.8766766617826995, + "compression_loss": 154.8443603515625, + "distillation_loss": 5.241974353790283, + "epoch": 1.67, + "learning_rate": 4.165257819103973e-05, + "loss": 160.6418, + "step": 1975, + "task_loss": 2.6789393424987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.445562978747557, + "compression/movement_sparsity/importance_threshold": -0.00017093385248569126, + "compression/movement_sparsity/linear_layer_sparsity": 0.908017221264473, + "compression/movement_sparsity/model_sparsity": 0.8768240363263339, + "compression_loss": 154.88751220703125, + "distillation_loss": 7.543951988220215, + "epoch": 1.67, + "learning_rate": 4.164835164835165e-05, + "loss": 161.3425, + "step": 1976, + "task_loss": 3.8015730381011963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.445978526727559, + "compression/movement_sparsity/importance_threshold": -0.00016962901957088064, + "compression/movement_sparsity/linear_layer_sparsity": 0.9082465945531165, + "compression/movement_sparsity/model_sparsity": 0.8770455299368777, + "compression_loss": 154.93064880371094, + "distillation_loss": 6.970600128173828, + "epoch": 1.67, + "learning_rate": 4.164412510566357e-05, + "loss": 160.9763, + "step": 1977, + "task_loss": 3.432961940765381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.446391954569417, + "compression/movement_sparsity/importance_threshold": -0.00016833084395238233, + "compression/movement_sparsity/linear_layer_sparsity": 0.908361388514947, + "compression/movement_sparsity/model_sparsity": 0.8771563803729717, + "compression_loss": 154.97344970703125, + "distillation_loss": 6.539155006408691, + "epoch": 1.67, + "learning_rate": 4.163989856297549e-05, + "loss": 161.7528, + "step": 1978, + "task_loss": 3.08008074760437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4468032676954792, + "compression/movement_sparsity/importance_threshold": -0.00016703930860386112, + "compression/movement_sparsity/linear_layer_sparsity": 0.9086290145333663, + "compression/movement_sparsity/model_sparsity": 0.8774148126143444, + "compression_loss": 155.0161895751953, + "distillation_loss": 6.985867977142334, + "epoch": 1.67, + "learning_rate": 4.163567202028741e-05, + "loss": 161.6048, + "step": 1979, + "task_loss": 3.723555564880371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.447212471528093, + "compression/movement_sparsity/importance_threshold": -0.00016575439649898615, + "compression/movement_sparsity/linear_layer_sparsity": 0.9087660947645082, + "compression/movement_sparsity/model_sparsity": 0.8775471837178385, + "compression_loss": 155.05844116210938, + "distillation_loss": 7.06993293762207, + "epoch": 1.67, + "learning_rate": 4.163144547759932e-05, + "loss": 162.1096, + "step": 1980, + "task_loss": 3.500683307647705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4476195714896074, + "compression/movement_sparsity/importance_threshold": -0.00016447609061142222, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088791597220315, + "compression/movement_sparsity/model_sparsity": 0.8776563645462423, + "compression_loss": 155.1007080078125, + "distillation_loss": 6.764288425445557, + "epoch": 1.67, + "learning_rate": 4.162721893491124e-05, + "loss": 162.03, + "step": 1981, + "task_loss": 3.8429129123687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4480245730023702, + "compression/movement_sparsity/importance_threshold": -0.0001632043739148376, + "compression/movement_sparsity/linear_layer_sparsity": 0.9090550531188281, + "compression/movement_sparsity/model_sparsity": 0.8778262154637475, + "compression_loss": 155.1427459716797, + "distillation_loss": 7.9922990798950195, + "epoch": 1.67, + "learning_rate": 4.162299239222316e-05, + "loss": 161.8125, + "step": 1982, + "task_loss": 3.641343593597412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.44842748148873, + "compression/movement_sparsity/importance_threshold": -0.00016193922938289706, + "compression/movement_sparsity/linear_layer_sparsity": 0.90920025370813, + "compression/movement_sparsity/model_sparsity": 0.8779664279661177, + "compression_loss": 155.18458557128906, + "distillation_loss": 7.973152160644531, + "epoch": 1.68, + "learning_rate": 4.161876584953508e-05, + "loss": 161.307, + "step": 1983, + "task_loss": 2.9139230251312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4488283023710342, + "compression/movement_sparsity/importance_threshold": -0.00016068063998926976, + "compression/movement_sparsity/linear_layer_sparsity": 0.9093686468034836, + "compression/movement_sparsity/model_sparsity": 0.8781290362406081, + "compression_loss": 155.22616577148438, + "distillation_loss": 5.324052810668945, + "epoch": 1.68, + "learning_rate": 4.1614539306847e-05, + "loss": 161.4149, + "step": 1984, + "task_loss": 3.243025541305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.449227041071632, + "compression/movement_sparsity/importance_threshold": -0.0001594285887076205, + "compression/movement_sparsity/linear_layer_sparsity": 0.9095258550295948, + "compression/movement_sparsity/model_sparsity": 0.8782808438805234, + "compression_loss": 155.26771545410156, + "distillation_loss": 6.979626655578613, + "epoch": 1.68, + "learning_rate": 4.161031276415892e-05, + "loss": 161.7736, + "step": 1985, + "task_loss": 3.892354965209961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4496237030128707, + "compression/movement_sparsity/importance_threshold": -0.00015818305851161667, + "compression/movement_sparsity/linear_layer_sparsity": 0.9095826498400444, + "compression/movement_sparsity/model_sparsity": 0.8783356876145128, + "compression_loss": 155.30886840820312, + "distillation_loss": 4.373015403747559, + "epoch": 1.68, + "learning_rate": 4.160608622147084e-05, + "loss": 161.0504, + "step": 1986, + "task_loss": 2.4976749420166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.450018293617099, + "compression/movement_sparsity/importance_threshold": -0.00015694403237492566, + "compression/movement_sparsity/linear_layer_sparsity": 0.909729794068671, + "compression/movement_sparsity/model_sparsity": 0.8784777769862174, + "compression_loss": 155.34994506835938, + "distillation_loss": 5.695289134979248, + "epoch": 1.68, + "learning_rate": 4.1601859678782754e-05, + "loss": 161.8875, + "step": 1987, + "task_loss": 3.5976452827453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4504108183066649, + "compression/movement_sparsity/importance_threshold": -0.00015571149327121316, + "compression/movement_sparsity/linear_layer_sparsity": 0.9099929962208974, + "compression/movement_sparsity/model_sparsity": 0.8787319373348104, + "compression_loss": 155.3907012939453, + "distillation_loss": 6.410995006561279, + "epoch": 1.68, + "learning_rate": 4.1597633136094674e-05, + "loss": 161.9052, + "step": 1988, + "task_loss": 3.247265338897705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.450801282503917, + "compression/movement_sparsity/importance_threshold": -0.00015448542417414654, + "compression/movement_sparsity/linear_layer_sparsity": 0.9102200442968519, + "compression/movement_sparsity/model_sparsity": 0.8789511856108742, + "compression_loss": 155.43121337890625, + "distillation_loss": 7.766597747802734, + "epoch": 1.68, + "learning_rate": 4.15934065934066e-05, + "loss": 161.5383, + "step": 1989, + "task_loss": 3.7913436889648438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4511896916312026, + "compression/movement_sparsity/importance_threshold": -0.00015326580805739235, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104185220671508, + "compression/movement_sparsity/model_sparsity": 0.8791428450591743, + "compression_loss": 155.47146606445312, + "distillation_loss": 6.82233190536499, + "epoch": 1.68, + "learning_rate": 4.1589180050718514e-05, + "loss": 161.8416, + "step": 1990, + "task_loss": 3.6756303310394287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4515760511108708, + "compression/movement_sparsity/importance_threshold": -0.00015205262789461625, + "compression/movement_sparsity/linear_layer_sparsity": 0.9106232957979615, + "compression/movement_sparsity/model_sparsity": 0.8793405841823739, + "compression_loss": 155.51161193847656, + "distillation_loss": 7.415669918060303, + "epoch": 1.68, + "learning_rate": 4.158495350803043e-05, + "loss": 161.8125, + "step": 1991, + "task_loss": 3.576765537261963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4519603663652696, + "compression/movement_sparsity/importance_threshold": -0.0001508458666594865, + "compression/movement_sparsity/linear_layer_sparsity": 0.9108281529979456, + "compression/movement_sparsity/model_sparsity": 0.8795384039073242, + "compression_loss": 155.5514678955078, + "distillation_loss": 7.50686502456665, + "epoch": 1.68, + "learning_rate": 4.158072696534235e-05, + "loss": 162.2207, + "step": 1992, + "task_loss": 3.2191779613494873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4523426428167467, + "compression/movement_sparsity/importance_threshold": -0.00014964550732566965, + "compression/movement_sparsity/linear_layer_sparsity": 0.9109674153517648, + "compression/movement_sparsity/model_sparsity": 0.8796728821708686, + "compression_loss": 155.59109497070312, + "distillation_loss": 7.088069915771484, + "epoch": 1.68, + "learning_rate": 4.1576500422654266e-05, + "loss": 162.4279, + "step": 1993, + "task_loss": 3.2839553356170654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.452722885887651, + "compression/movement_sparsity/importance_threshold": -0.00014845153286683135, + "compression/movement_sparsity/linear_layer_sparsity": 0.9111283796906814, + "compression/movement_sparsity/model_sparsity": 0.8798283168895591, + "compression_loss": 155.63050842285156, + "distillation_loss": 6.707387447357178, + "epoch": 1.69, + "learning_rate": 4.157227387996619e-05, + "loss": 162.0499, + "step": 1994, + "task_loss": 3.114069700241089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4531011010003303, + "compression/movement_sparsity/importance_threshold": -0.00014726392625663987, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113064790584906, + "compression/movement_sparsity/model_sparsity": 0.8800002979961864, + "compression_loss": 155.6697235107422, + "distillation_loss": 6.702914237976074, + "epoch": 1.69, + "learning_rate": 4.156804733727811e-05, + "loss": 161.6484, + "step": 1995, + "task_loss": 2.559359073638916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4534772935771327, + "compression/movement_sparsity/importance_threshold": -0.00014608267046876, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115579955264339, + "compression/movement_sparsity/model_sparsity": 0.8802431740997007, + "compression_loss": 155.70875549316406, + "distillation_loss": 5.769513130187988, + "epoch": 1.69, + "learning_rate": 4.1563820794590025e-05, + "loss": 161.2939, + "step": 1996, + "task_loss": 2.9781363010406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4538514690404065, + "compression/movement_sparsity/importance_threshold": -0.00014490774847686003, + "compression/movement_sparsity/linear_layer_sparsity": 0.9116636913483581, + "compression/movement_sparsity/model_sparsity": 0.8803452389449834, + "compression_loss": 155.74737548828125, + "distillation_loss": 6.210541248321533, + "epoch": 1.69, + "learning_rate": 4.1559594251901945e-05, + "loss": 161.9734, + "step": 1997, + "task_loss": 2.5509679317474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4542236328125, + "compression/movement_sparsity/importance_threshold": -0.00014373914325460646, + "compression/movement_sparsity/linear_layer_sparsity": 0.9118287814492767, + "compression/movement_sparsity/model_sparsity": 0.8805046576930589, + "compression_loss": 155.7859344482422, + "distillation_loss": 5.039443016052246, + "epoch": 1.69, + "learning_rate": 4.1555367709213865e-05, + "loss": 162.1594, + "step": 1998, + "task_loss": 2.883822441101074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4545937903157613, + "compression/movement_sparsity/importance_threshold": -0.00014257683777566497, + "compression/movement_sparsity/linear_layer_sparsity": 0.911973838948567, + "compression/movement_sparsity/model_sparsity": 0.8806447320209995, + "compression_loss": 155.82421875, + "distillation_loss": 6.975203514099121, + "epoch": 1.69, + "learning_rate": 4.1551141166525784e-05, + "loss": 162.5386, + "step": 1999, + "task_loss": 3.747464895248413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4549619469725386, + "compression/movement_sparsity/importance_threshold": -0.0001414208150137047, + "compression/movement_sparsity/linear_layer_sparsity": 0.9121312975821986, + "compression/movement_sparsity/model_sparsity": 0.8807967814661664, + "compression_loss": 155.86231994628906, + "distillation_loss": 5.985924243927002, + "epoch": 1.69, + "learning_rate": 4.1546914623837704e-05, + "loss": 162.0751, + "step": 2000, + "task_loss": 2.508779525756836 + }, + { + "epoch": 1.69, + "eval_accuracy": 0.35116831683168315, + "eval_loss": 162.501953125, + "eval_runtime": 377.4016, + "eval_samples_per_second": 66.905, + "eval_steps_per_second": 0.525, + "step": 2000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.45532810820518, + "compression/movement_sparsity/importance_threshold": -0.0001402710579423887, + "compression/movement_sparsity/linear_layer_sparsity": 0.9123718557400844, + "compression/movement_sparsity/model_sparsity": 0.8810290757112854, + "compression_loss": 155.9002227783203, + "distillation_loss": 8.141429901123047, + "epoch": 1.69, + "learning_rate": 4.1542688081149624e-05, + "loss": 163.2476, + "step": 2001, + "task_loss": 3.9372689723968506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.455692279436034, + "compression/movement_sparsity/importance_threshold": -0.000139127549535387, + "compression/movement_sparsity/linear_layer_sparsity": 0.9125571692293134, + "compression/movement_sparsity/model_sparsity": 0.8812080231120684, + "compression_loss": 155.93775939941406, + "distillation_loss": 7.465813636779785, + "epoch": 1.69, + "learning_rate": 4.1538461538461544e-05, + "loss": 161.7226, + "step": 2002, + "task_loss": 3.714134693145752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4560544660874486, + "compression/movement_sparsity/importance_threshold": -0.00013799027276636434, + "compression/movement_sparsity/linear_layer_sparsity": 0.9126854971214106, + "compression/movement_sparsity/model_sparsity": 0.8813319425462892, + "compression_loss": 155.9752655029297, + "distillation_loss": 6.34322452545166, + "epoch": 1.69, + "learning_rate": 4.153423499577346e-05, + "loss": 162.0157, + "step": 2003, + "task_loss": 3.737729787826538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4564146735817722, + "compression/movement_sparsity/importance_threshold": -0.00013685921060898817, + "compression/movement_sparsity/linear_layer_sparsity": 0.9129124021073534, + "compression/movement_sparsity/model_sparsity": 0.8815510526479235, + "compression_loss": 156.012451171875, + "distillation_loss": 6.389775276184082, + "epoch": 1.69, + "learning_rate": 4.1530008453085376e-05, + "loss": 161.6535, + "step": 2004, + "task_loss": 3.454962730407715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4567729073413525, + "compression/movement_sparsity/importance_threshold": -0.00013573434603692588, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131784899081477, + "compression/movement_sparsity/model_sparsity": 0.8818079995141787, + "compression_loss": 156.04942321777344, + "distillation_loss": 9.083995819091797, + "epoch": 1.69, + "learning_rate": 4.1525781910397296e-05, + "loss": 163.7404, + "step": 2005, + "task_loss": 4.205441951751709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4571291727885378, + "compression/movement_sparsity/importance_threshold": -0.00013461566202384312, + "compression/movement_sparsity/linear_layer_sparsity": 0.9133159517126539, + "compression/movement_sparsity/model_sparsity": 0.8819407390828182, + "compression_loss": 156.086181640625, + "distillation_loss": 6.303823471069336, + "epoch": 1.7, + "learning_rate": 4.1521555367709216e-05, + "loss": 162.5442, + "step": 2006, + "task_loss": 3.544614315032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.457483475345677, + "compression/movement_sparsity/importance_threshold": -0.00013350314154340643, + "compression/movement_sparsity/linear_layer_sparsity": 0.9133968810383984, + "compression/movement_sparsity/model_sparsity": 0.8820188882372557, + "compression_loss": 156.1227569580078, + "distillation_loss": 7.097990036010742, + "epoch": 1.7, + "learning_rate": 4.1517328825021136e-05, + "loss": 162.3194, + "step": 2007, + "task_loss": 2.5408976078033447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4578358204351174, + "compression/movement_sparsity/importance_threshold": -0.0001323967675692832, + "compression/movement_sparsity/linear_layer_sparsity": 0.9134877074232809, + "compression/movement_sparsity/model_sparsity": 0.8821065944564027, + "compression_loss": 156.1591339111328, + "distillation_loss": 6.6025390625, + "epoch": 1.7, + "learning_rate": 4.1513102282333055e-05, + "loss": 162.0804, + "step": 2008, + "task_loss": 4.805090427398682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4581862134792076, + "compression/movement_sparsity/importance_threshold": -0.00013129652307514086, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137236509282917, + "compression/movement_sparsity/model_sparsity": 0.8823344325761693, + "compression_loss": 156.19512939453125, + "distillation_loss": 5.5980224609375, + "epoch": 1.7, + "learning_rate": 4.150887573964497e-05, + "loss": 162.647, + "step": 2009, + "task_loss": 3.799257278442383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.458534659900296, + "compression/movement_sparsity/importance_threshold": -0.00013020239103464417, + "compression/movement_sparsity/linear_layer_sparsity": 0.9138732634596188, + "compression/movement_sparsity/model_sparsity": 0.8824789054567834, + "compression_loss": 156.2310028076172, + "distillation_loss": 5.673425674438477, + "epoch": 1.7, + "learning_rate": 4.150464919695689e-05, + "loss": 162.1084, + "step": 2010, + "task_loss": 3.0742673873901367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4588811651207303, + "compression/movement_sparsity/importance_threshold": -0.00012911435442146228, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140479644396519, + "compression/movement_sparsity/model_sparsity": 0.8826476049207093, + "compression_loss": 156.26663208007812, + "distillation_loss": 7.06057071685791, + "epoch": 1.7, + "learning_rate": 4.1500422654268815e-05, + "loss": 162.7097, + "step": 2011, + "task_loss": 2.999030113220215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.459225734562859, + "compression/movement_sparsity/importance_threshold": -0.00012803239620926, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140976166736876, + "compression/movement_sparsity/model_sparsity": 0.8826955514477577, + "compression_loss": 156.3020782470703, + "distillation_loss": 7.243545055389404, + "epoch": 1.7, + "learning_rate": 4.149619611158073e-05, + "loss": 163.2383, + "step": 2012, + "task_loss": 2.957284927368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4595683736490304, + "compression/movement_sparsity/importance_threshold": -0.00012695649937170383, + "compression/movement_sparsity/linear_layer_sparsity": 0.9143062061381417, + "compression/movement_sparsity/model_sparsity": 0.8828969752224116, + "compression_loss": 156.33729553222656, + "distillation_loss": 5.708771705627441, + "epoch": 1.7, + "learning_rate": 4.149196956889265e-05, + "loss": 161.5347, + "step": 2013, + "task_loss": 2.653768301010132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4599090878015926, + "compression/movement_sparsity/importance_threshold": -0.00012588664688246206, + "compression/movement_sparsity/linear_layer_sparsity": 0.9144212862799955, + "compression/movement_sparsity/model_sparsity": 0.8830081020073647, + "compression_loss": 156.3724365234375, + "distillation_loss": 6.637938499450684, + "epoch": 1.7, + "learning_rate": 4.148774302620457e-05, + "loss": 162.8613, + "step": 2014, + "task_loss": 2.731832265853882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4602478824428933, + "compression/movement_sparsity/importance_threshold": -0.00012482282171520122, + "compression/movement_sparsity/linear_layer_sparsity": 0.9145664510967945, + "compression/movement_sparsity/model_sparsity": 0.8831482799661274, + "compression_loss": 156.4072723388672, + "distillation_loss": 7.97456693649292, + "epoch": 1.7, + "learning_rate": 4.148351648351649e-05, + "loss": 162.577, + "step": 2015, + "task_loss": 3.6928513050079346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4605847629952815, + "compression/movement_sparsity/importance_threshold": -0.00012376500684358697, + "compression/movement_sparsity/linear_layer_sparsity": 0.9146595669218629, + "compression/movement_sparsity/model_sparsity": 0.883238196976147, + "compression_loss": 156.44183349609375, + "distillation_loss": 6.399290084838867, + "epoch": 1.7, + "learning_rate": 4.14792899408284e-05, + "loss": 163.7199, + "step": 2016, + "task_loss": 4.422609806060791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4609197348811052, + "compression/movement_sparsity/importance_threshold": -0.0001227131852412867, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148900253097613, + "compression/movement_sparsity/model_sparsity": 0.8834607384094482, + "compression_loss": 156.476318359375, + "distillation_loss": 7.382699012756348, + "epoch": 1.7, + "learning_rate": 4.1475063398140326e-05, + "loss": 162.7684, + "step": 2017, + "task_loss": 4.515773773193359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4612528035227121, + "compression/movement_sparsity/importance_threshold": -0.00012166733988196697, + "compression/movement_sparsity/linear_layer_sparsity": 0.9151391927166802, + "compression/movement_sparsity/model_sparsity": 0.8837013461494109, + "compression_loss": 156.5106658935547, + "distillation_loss": 7.100282669067383, + "epoch": 1.71, + "learning_rate": 4.1470836855452246e-05, + "loss": 163.0127, + "step": 2018, + "task_loss": 3.40665602684021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4615839743424508, + "compression/movement_sparsity/importance_threshold": -0.00012062745373929429, + "compression/movement_sparsity/linear_layer_sparsity": 0.9153970886143087, + "compression/movement_sparsity/model_sparsity": 0.8839503825295751, + "compression_loss": 156.54478454589844, + "distillation_loss": 6.993110656738281, + "epoch": 1.71, + "learning_rate": 4.146661031276416e-05, + "loss": 163.0793, + "step": 2019, + "task_loss": 3.9153037071228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4619132527626695, + "compression/movement_sparsity/importance_threshold": -0.00011959350978693519, + "compression/movement_sparsity/linear_layer_sparsity": 0.9155058847198183, + "compression/movement_sparsity/model_sparsity": 0.8840554411541646, + "compression_loss": 156.5786590576172, + "distillation_loss": 7.774040699005127, + "epoch": 1.71, + "learning_rate": 4.146238377007608e-05, + "loss": 163.3161, + "step": 2020, + "task_loss": 3.7484071254730225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4622406442057163, + "compression/movement_sparsity/importance_threshold": -0.00011856549099855708, + "compression/movement_sparsity/linear_layer_sparsity": 0.9155900693433275, + "compression/movement_sparsity/model_sparsity": 0.8841367337768741, + "compression_loss": 156.61244201660156, + "distillation_loss": 7.001641273498535, + "epoch": 1.71, + "learning_rate": 4.1458157227388e-05, + "loss": 162.8766, + "step": 2021, + "task_loss": 3.392859935760498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4625661540939394, + "compression/movement_sparsity/importance_threshold": -0.00011754338034782735, + "compression/movement_sparsity/linear_layer_sparsity": 0.9157533708191007, + "compression/movement_sparsity/model_sparsity": 0.8842944253445802, + "compression_loss": 156.64599609375, + "distillation_loss": 6.925978660583496, + "epoch": 1.71, + "learning_rate": 4.145393068469992e-05, + "loss": 162.8029, + "step": 2022, + "task_loss": 3.8084402084350586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.462889787849687, + "compression/movement_sparsity/importance_threshold": -0.00011652716080841081, + "compression/movement_sparsity/linear_layer_sparsity": 0.9159962899621784, + "compression/movement_sparsity/model_sparsity": 0.8845289994677866, + "compression_loss": 156.6792755126953, + "distillation_loss": 5.965188026428223, + "epoch": 1.71, + "learning_rate": 4.144970414201184e-05, + "loss": 162.1668, + "step": 2023, + "task_loss": 3.324993133544922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4632115508953074, + "compression/movement_sparsity/importance_threshold": -0.00011551681535397571, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161209332864761, + "compression/movement_sparsity/model_sparsity": 0.884649360910447, + "compression_loss": 156.71253967285156, + "distillation_loss": 6.55002498626709, + "epoch": 1.71, + "learning_rate": 4.144547759932376e-05, + "loss": 162.72, + "step": 2024, + "task_loss": 3.754878282546997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4635314486531485, + "compression/movement_sparsity/importance_threshold": -0.00011451232695818773, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162370150584113, + "compression/movement_sparsity/model_sparsity": 0.8847614549164068, + "compression_loss": 156.7454833984375, + "distillation_loss": 7.326180458068848, + "epoch": 1.71, + "learning_rate": 4.144125105663567e-05, + "loss": 163.1221, + "step": 2025, + "task_loss": 2.8257737159729004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4638494865455587, + "compression/movement_sparsity/importance_threshold": -0.00011351367859471426, + "compression/movement_sparsity/linear_layer_sparsity": 0.916367322362336, + "compression/movement_sparsity/model_sparsity": 0.8848872857635696, + "compression_loss": 156.77821350097656, + "distillation_loss": 5.909095287322998, + "epoch": 1.71, + "learning_rate": 4.143702451394759e-05, + "loss": 162.6844, + "step": 2026, + "task_loss": 3.5221149921417236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4641656699948862, + "compression/movement_sparsity/importance_threshold": -0.00011252085323722183, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165036990675873, + "compression/movement_sparsity/model_sparsity": 0.8850189775094517, + "compression_loss": 156.8107452392578, + "distillation_loss": 7.84874153137207, + "epoch": 1.71, + "learning_rate": 4.143279797125951e-05, + "loss": 163.5946, + "step": 2027, + "task_loss": 4.277594566345215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4644800044234794, + "compression/movement_sparsity/importance_threshold": -0.00011153383385937785, + "compression/movement_sparsity/linear_layer_sparsity": 0.9166309537845972, + "compression/movement_sparsity/model_sparsity": 0.8851418606354511, + "compression_loss": 156.8430633544922, + "distillation_loss": 8.344863891601562, + "epoch": 1.71, + "learning_rate": 4.1428571428571437e-05, + "loss": 163.4625, + "step": 2028, + "task_loss": 3.5164167881011963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4647924952536862, + "compression/movement_sparsity/importance_threshold": -0.0001105526034348471, + "compression/movement_sparsity/linear_layer_sparsity": 0.9167332392945777, + "compression/movement_sparsity/model_sparsity": 0.8852406323234967, + "compression_loss": 156.87533569335938, + "distillation_loss": 7.735446453094482, + "epoch": 1.71, + "learning_rate": 4.142434488588335e-05, + "loss": 162.9165, + "step": 2029, + "task_loss": 3.5454776287078857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4651031479078545, + "compression/movement_sparsity/importance_threshold": -0.00010957714493729873, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168824583283729, + "compression/movement_sparsity/model_sparsity": 0.8853847252244296, + "compression_loss": 156.9072265625, + "distillation_loss": 7.075920104980469, + "epoch": 1.72, + "learning_rate": 4.142011834319527e-05, + "loss": 163.4122, + "step": 2030, + "task_loss": 3.234644889831543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4654119678083333, + "compression/movement_sparsity/importance_threshold": -0.00010860744134039754, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170195504836824, + "compression/movement_sparsity/model_sparsity": 0.8855171078424594, + "compression_loss": 156.939208984375, + "distillation_loss": 5.532312393188477, + "epoch": 1.72, + "learning_rate": 4.141589180050719e-05, + "loss": 163.3148, + "step": 2031, + "task_loss": 2.670731544494629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4657189603774703, + "compression/movement_sparsity/importance_threshold": -0.00010764347561781178, + "compression/movement_sparsity/linear_layer_sparsity": 0.917132901621229, + "compression/movement_sparsity/model_sparsity": 0.8856265650197223, + "compression_loss": 156.97080993652344, + "distillation_loss": 6.753641128540039, + "epoch": 1.72, + "learning_rate": 4.14116652578191e-05, + "loss": 162.9724, + "step": 2032, + "task_loss": 2.9162168502807617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4660241310376136, + "compression/movement_sparsity/importance_threshold": -0.00010668523074320625, + "compression/movement_sparsity/linear_layer_sparsity": 0.9173215896498988, + "compression/movement_sparsity/model_sparsity": 0.8858087710341351, + "compression_loss": 157.00216674804688, + "distillation_loss": 8.310647964477539, + "epoch": 1.72, + "learning_rate": 4.140743871513102e-05, + "loss": 163.1907, + "step": 2033, + "task_loss": 4.008687973022461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4663274852111117, + "compression/movement_sparsity/importance_threshold": -0.0001057326896902501, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174730981238801, + "compression/movement_sparsity/model_sparsity": 0.8859550747259405, + "compression_loss": 157.03335571289062, + "distillation_loss": 6.653524398803711, + "epoch": 1.72, + "learning_rate": 4.140321217244295e-05, + "loss": 163.1501, + "step": 2034, + "task_loss": 2.6538071632385254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4666290283203125, + "compression/movement_sparsity/importance_threshold": -0.00010478583543260811, + "compression/movement_sparsity/linear_layer_sparsity": 0.9175802963909265, + "compression/movement_sparsity/model_sparsity": 0.8860585904027335, + "compression_loss": 157.06422424316406, + "distillation_loss": 4.820926666259766, + "epoch": 1.72, + "learning_rate": 4.139898562975486e-05, + "loss": 162.9441, + "step": 2035, + "task_loss": 2.164738893508911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4669287657875645, + "compression/movement_sparsity/importance_threshold": -0.00010384465094394769, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177019944458181, + "compression/movement_sparsity/model_sparsity": 0.8861761077550525, + "compression_loss": 157.09498596191406, + "distillation_loss": 6.006706714630127, + "epoch": 1.72, + "learning_rate": 4.139475908706678e-05, + "loss": 163.1836, + "step": 2036, + "task_loss": 3.0218958854675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4672267030352155, + "compression/movement_sparsity/importance_threshold": -0.00010290911919793623, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178168361043191, + "compression/movement_sparsity/model_sparsity": 0.8862870042492897, + "compression_loss": 157.12550354003906, + "distillation_loss": 7.793644905090332, + "epoch": 1.72, + "learning_rate": 4.13905325443787e-05, + "loss": 164.2727, + "step": 2037, + "task_loss": 4.520610332489014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4675228454856142, + "compression/movement_sparsity/importance_threshold": -0.00010197922316823854, + "compression/movement_sparsity/linear_layer_sparsity": 0.917902535097118, + "compression/movement_sparsity/model_sparsity": 0.8863697592180451, + "compression_loss": 157.15582275390625, + "distillation_loss": 7.076512336730957, + "epoch": 1.72, + "learning_rate": 4.1386306001690614e-05, + "loss": 163.6283, + "step": 2038, + "task_loss": 3.4475290775299072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4678171985611084, + "compression/movement_sparsity/importance_threshold": -0.00010105494582852374, + "compression/movement_sparsity/linear_layer_sparsity": 0.9180419643892842, + "compression/movement_sparsity/model_sparsity": 0.8865043986850907, + "compression_loss": 157.18597412109375, + "distillation_loss": 6.465146541595459, + "epoch": 1.72, + "learning_rate": 4.138207945900254e-05, + "loss": 163.2867, + "step": 2039, + "task_loss": 2.3278679847717285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4681097676840464, + "compression/movement_sparsity/importance_threshold": -0.00010013627015245576, + "compression/movement_sparsity/linear_layer_sparsity": 0.918228494143612, + "compression/movement_sparsity/model_sparsity": 0.8866845205685246, + "compression_loss": 157.2158660888672, + "distillation_loss": 6.347691535949707, + "epoch": 1.72, + "learning_rate": 4.137785291631446e-05, + "loss": 163.8396, + "step": 2040, + "task_loss": 3.275932788848877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4684005582767765, + "compression/movement_sparsity/importance_threshold": -9.922317911370462e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9183386973009027, + "compression/movement_sparsity/model_sparsity": 0.8867909379083379, + "compression_loss": 157.2456817626953, + "distillation_loss": 5.799373626708984, + "epoch": 1.72, + "learning_rate": 4.137362637362637e-05, + "loss": 163.0939, + "step": 2041, + "task_loss": 2.822983980178833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4686895757616467, + "compression/movement_sparsity/importance_threshold": -9.83156556859351e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185223891033332, + "compression/movement_sparsity/model_sparsity": 0.8869683193322527, + "compression_loss": 157.27516174316406, + "distillation_loss": 8.249189376831055, + "epoch": 1.73, + "learning_rate": 4.136939983093829e-05, + "loss": 164.4134, + "step": 2042, + "task_loss": 4.526817798614502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4689768255610054, + "compression/movement_sparsity/importance_threshold": -9.741368284281373e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9187148332448083, + "compression/movement_sparsity/model_sparsity": 0.8871541524254408, + "compression_loss": 157.30471801757812, + "distillation_loss": 7.420807838439941, + "epoch": 1.73, + "learning_rate": 4.136517328825021e-05, + "loss": 163.9933, + "step": 2043, + "task_loss": 3.103855609893799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4692623130972007, + "compression/movement_sparsity/importance_threshold": -9.651724355800792e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9188981315497068, + "compression/movement_sparsity/model_sparsity": 0.8873311538696745, + "compression_loss": 157.33387756347656, + "distillation_loss": 5.574035167694092, + "epoch": 1.73, + "learning_rate": 4.136094674556213e-05, + "loss": 164.164, + "step": 2044, + "task_loss": 1.412148356437683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4695460437925807, + "compression/movement_sparsity/importance_threshold": -9.562632080518333e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91904191316306, + "compression/movement_sparsity/model_sparsity": 0.8874699961422851, + "compression_loss": 157.36293029785156, + "distillation_loss": 6.073769569396973, + "epoch": 1.73, + "learning_rate": 4.135672020287405e-05, + "loss": 163.5702, + "step": 2045, + "task_loss": 3.0423617362976074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4698280230694938, + "compression/movement_sparsity/importance_threshold": -9.474089755800823e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191824752511516, + "compression/movement_sparsity/model_sparsity": 0.8876057294902311, + "compression_loss": 157.3916015625, + "distillation_loss": 6.953325271606445, + "epoch": 1.73, + "learning_rate": 4.135249366018597e-05, + "loss": 163.5973, + "step": 2046, + "task_loss": 2.9939026832580566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4701082563502879, + "compression/movement_sparsity/importance_threshold": -9.386095679014914e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9192988789756129, + "compression/movement_sparsity/model_sparsity": 0.8877181343886574, + "compression_loss": 157.42037963867188, + "distillation_loss": 5.951033592224121, + "epoch": 1.73, + "learning_rate": 4.134826711749789e-05, + "loss": 164.2216, + "step": 2047, + "task_loss": 2.8542072772979736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4703867490573115, + "compression/movement_sparsity/importance_threshold": -9.298648147527174e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9194350410458468, + "compression/movement_sparsity/model_sparsity": 0.8878496188728953, + "compression_loss": 157.4488067626953, + "distillation_loss": 7.951514720916748, + "epoch": 1.73, + "learning_rate": 4.1344040574809804e-05, + "loss": 163.5652, + "step": 2048, + "task_loss": 3.86883807182312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4706635066129126, + "compression/movement_sparsity/importance_threshold": -9.211745458704342e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9196636631119292, + "compression/movement_sparsity/model_sparsity": 0.888070387067684, + "compression_loss": 157.4771270751953, + "distillation_loss": 6.9326043128967285, + "epoch": 1.73, + "learning_rate": 4.1339814032121724e-05, + "loss": 163.7969, + "step": 2049, + "task_loss": 3.4746665954589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4709385344394397, + "compression/movement_sparsity/importance_threshold": -9.125385909913072e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9198505625154537, + "compression/movement_sparsity/model_sparsity": 0.8882508659017276, + "compression_loss": 157.5052490234375, + "distillation_loss": 8.950945854187012, + "epoch": 1.73, + "learning_rate": 4.1335587489433644e-05, + "loss": 164.6084, + "step": 2050, + "task_loss": 3.856461524963379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4712118379592405, + "compression/movement_sparsity/importance_threshold": -9.039567798520016e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91997596898648, + "compression/movement_sparsity/model_sparsity": 0.8883719642746788, + "compression_loss": 157.53318786621094, + "distillation_loss": 6.731488227844238, + "epoch": 1.73, + "learning_rate": 4.1331360946745563e-05, + "loss": 164.1801, + "step": 2051, + "task_loss": 3.1885368824005127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4714834225946638, + "compression/movement_sparsity/importance_threshold": -8.954289421891914e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200560755446578, + "compression/movement_sparsity/model_sparsity": 0.8884493189261465, + "compression_loss": 157.56094360351562, + "distillation_loss": 6.878177165985107, + "epoch": 1.73, + "learning_rate": 4.132713440405748e-05, + "loss": 164.0798, + "step": 2052, + "task_loss": 2.3549516201019287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.471753293768057, + "compression/movement_sparsity/importance_threshold": -8.869549077395421e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202378714044341, + "compression/movement_sparsity/model_sparsity": 0.8886248695388701, + "compression_loss": 157.5885009765625, + "distillation_loss": 6.572344779968262, + "epoch": 1.73, + "learning_rate": 4.13229078613694e-05, + "loss": 163.6014, + "step": 2053, + "task_loss": 2.567559242248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.472021456901769, + "compression/movement_sparsity/importance_threshold": -8.785345062397101e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9203249178281759, + "compression/movement_sparsity/model_sparsity": 0.8887089256501702, + "compression_loss": 157.61595153808594, + "distillation_loss": 5.47068452835083, + "epoch": 1.74, + "learning_rate": 4.1318681318681316e-05, + "loss": 163.2955, + "step": 2054, + "task_loss": 2.1045193672180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4722879174181478, + "compression/movement_sparsity/importance_threshold": -8.701675674263696e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9203500420493848, + "compression/movement_sparsity/model_sparsity": 0.8887331867770893, + "compression_loss": 157.64317321777344, + "distillation_loss": 5.3615288734436035, + "epoch": 1.74, + "learning_rate": 4.1314454775993236e-05, + "loss": 163.4814, + "step": 2055, + "task_loss": 2.523104667663574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4725526807395415, + "compression/movement_sparsity/importance_threshold": -8.618539210361945e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9204586831407151, + "compression/movement_sparsity/model_sparsity": 0.8888380957127134, + "compression_loss": 157.670166015625, + "distillation_loss": 7.267078876495361, + "epoch": 1.74, + "learning_rate": 4.131022823330516e-05, + "loss": 163.8252, + "step": 2056, + "task_loss": 3.9273033142089844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4728157522882985, + "compression/movement_sparsity/importance_threshold": -8.5359339680585e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206906320495679, + "compression/movement_sparsity/model_sparsity": 0.8890620764629888, + "compression_loss": 157.6969757080078, + "distillation_loss": 5.640903472900391, + "epoch": 1.74, + "learning_rate": 4.130600169061708e-05, + "loss": 163.6917, + "step": 2057, + "task_loss": 3.4636130332946777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4730771374867666, + "compression/movement_sparsity/importance_threshold": -8.45385824471993e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920845586607996, + "compression/movement_sparsity/model_sparsity": 0.8892117078556389, + "compression_loss": 157.72372436523438, + "distillation_loss": 6.200067043304443, + "epoch": 1.74, + "learning_rate": 4.1301775147928995e-05, + "loss": 164.0651, + "step": 2058, + "task_loss": 3.0539345741271973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4733368417572943, + "compression/movement_sparsity/importance_threshold": -8.372310337712973e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209980490153881, + "compression/movement_sparsity/model_sparsity": 0.8893589327103079, + "compression_loss": 157.75021362304688, + "distillation_loss": 6.288613319396973, + "epoch": 1.74, + "learning_rate": 4.1297548605240915e-05, + "loss": 163.8086, + "step": 2059, + "task_loss": 3.4023566246032715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4735948705222297, + "compression/movement_sparsity/importance_threshold": -8.29128854440437e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211175053267642, + "compression/movement_sparsity/model_sparsity": 0.8894742853298976, + "compression_loss": 157.77650451660156, + "distillation_loss": 6.743499755859375, + "epoch": 1.74, + "learning_rate": 4.1293322062552834e-05, + "loss": 164.5779, + "step": 2060, + "task_loss": 3.0008227825164795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.473851229203921, + "compression/movement_sparsity/importance_threshold": -8.210791162160686e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212479557207005, + "compression/movement_sparsity/model_sparsity": 0.8896002543514899, + "compression_loss": 157.802734375, + "distillation_loss": 6.0286865234375, + "epoch": 1.74, + "learning_rate": 4.1289095519864754e-05, + "loss": 163.6875, + "step": 2061, + "task_loss": 2.7711236476898193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4741059232247165, + "compression/movement_sparsity/importance_threshold": -8.130816488348663e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9213412861807864, + "compression/movement_sparsity/model_sparsity": 0.8896903786231538, + "compression_loss": 157.82858276367188, + "distillation_loss": 6.389291286468506, + "epoch": 1.74, + "learning_rate": 4.1284868977176674e-05, + "loss": 163.635, + "step": 2062, + "task_loss": 3.5396840572357178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4743589580069643, + "compression/movement_sparsity/importance_threshold": -8.051362820334865e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9213777502854169, + "compression/movement_sparsity/model_sparsity": 0.8897255900736135, + "compression_loss": 157.85430908203125, + "distillation_loss": 6.952322959899902, + "epoch": 1.74, + "learning_rate": 4.1280642434488594e-05, + "loss": 164.5493, + "step": 2063, + "task_loss": 3.490713596343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4746103389730125, + "compression/movement_sparsity/importance_threshold": -7.972428455486121e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.921456533260987, + "compression/movement_sparsity/model_sparsity": 0.889801666611608, + "compression_loss": 157.87986755371094, + "distillation_loss": 9.75994873046875, + "epoch": 1.74, + "learning_rate": 4.1276415891800506e-05, + "loss": 165.1418, + "step": 2064, + "task_loss": 4.526374816894531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4748600715452094, + "compression/movement_sparsity/importance_threshold": -7.894011691168996e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215729489096159, + "compression/movement_sparsity/model_sparsity": 0.8899140830245701, + "compression_loss": 157.90533447265625, + "distillation_loss": 6.862576484680176, + "epoch": 1.75, + "learning_rate": 4.1272189349112426e-05, + "loss": 164.3189, + "step": 2065, + "task_loss": 3.1907973289489746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4751081611459034, + "compression/movement_sparsity/importance_threshold": -7.816110824750231e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.921751644485807, + "compression/movement_sparsity/model_sparsity": 0.8900866398579871, + "compression_loss": 157.9305419921875, + "distillation_loss": 8.265615463256836, + "epoch": 1.75, + "learning_rate": 4.1267962806424346e-05, + "loss": 164.766, + "step": 2066, + "task_loss": 3.42014741897583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.475354613197442, + "compression/movement_sparsity/importance_threshold": -7.738724153596478e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9218749284549942, + "compression/movement_sparsity/model_sparsity": 0.8902056886435669, + "compression_loss": 157.95558166503906, + "distillation_loss": 6.264739036560059, + "epoch": 1.75, + "learning_rate": 4.1263736263736266e-05, + "loss": 164.4062, + "step": 2067, + "task_loss": 3.4520792961120605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4755994331221745, + "compression/movement_sparsity/importance_threshold": -7.661849975074304e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9220492597858305, + "compression/movement_sparsity/model_sparsity": 0.8903740311568831, + "compression_loss": 157.98036193847656, + "distillation_loss": 8.067062377929688, + "epoch": 1.75, + "learning_rate": 4.1259509721048185e-05, + "loss": 164.8944, + "step": 2068, + "task_loss": 3.4594380855560303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4758426263424482, + "compression/movement_sparsity/importance_threshold": -7.58548658655045e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.922238126677015, + "compression/movement_sparsity/model_sparsity": 0.8905564098893327, + "compression_loss": 158.00497436523438, + "distillation_loss": 8.489025115966797, + "epoch": 1.75, + "learning_rate": 4.1255283178360105e-05, + "loss": 165.0319, + "step": 2069, + "task_loss": 3.4526848793029785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4760841982806114, + "compression/movement_sparsity/importance_threshold": -7.509632285391653e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223003231354036, + "compression/movement_sparsity/model_sparsity": 0.8906164697080371, + "compression_loss": 158.0295867919922, + "distillation_loss": 7.891172409057617, + "epoch": 1.75, + "learning_rate": 4.125105663567202e-05, + "loss": 164.8274, + "step": 2070, + "task_loss": 3.8329386711120605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4763241543590127, + "compression/movement_sparsity/importance_threshold": -7.43428536896457e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224427096211434, + "compression/movement_sparsity/model_sparsity": 0.8907539647799597, + "compression_loss": 158.053955078125, + "distillation_loss": 6.789915561676025, + "epoch": 1.75, + "learning_rate": 4.124683009298394e-05, + "loss": 164.2328, + "step": 2071, + "task_loss": 3.3769826889038086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4765625, + "compression/movement_sparsity/importance_threshold": -7.359444134635851e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225536163043246, + "compression/movement_sparsity/model_sparsity": 0.8908610614773848, + "compression_loss": 158.0781707763672, + "distillation_loss": 7.640774726867676, + "epoch": 1.75, + "learning_rate": 4.124260355029586e-05, + "loss": 164.7641, + "step": 2072, + "task_loss": 2.1207337379455566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4767992406259216, + "compression/movement_sparsity/importance_threshold": -7.28510687977215e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9226302648538879, + "compression/movement_sparsity/model_sparsity": 0.8909350769134721, + "compression_loss": 158.10218811035156, + "distillation_loss": 7.081815719604492, + "epoch": 1.75, + "learning_rate": 4.1238377007607784e-05, + "loss": 164.846, + "step": 2073, + "task_loss": 3.063659906387329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4770343816591256, + "compression/movement_sparsity/importance_threshold": -7.211271901740122e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9227174185951384, + "compression/movement_sparsity/model_sparsity": 0.8910192366555943, + "compression_loss": 158.12603759765625, + "distillation_loss": 7.716804504394531, + "epoch": 1.75, + "learning_rate": 4.12341504649197e-05, + "loss": 164.3071, + "step": 2074, + "task_loss": 3.098231792449951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.47726792852196, + "compression/movement_sparsity/importance_threshold": -7.137937497906505e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9227878904258664, + "compression/movement_sparsity/model_sparsity": 0.8910872875621401, + "compression_loss": 158.1495819091797, + "distillation_loss": 5.241825103759766, + "epoch": 1.75, + "learning_rate": 4.122992392223162e-05, + "loss": 164.0335, + "step": 2075, + "task_loss": 2.4045944213867188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4774998866367737, + "compression/movement_sparsity/importance_threshold": -7.065101965637866e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9229345219152846, + "compression/movement_sparsity/model_sparsity": 0.8912288818088056, + "compression_loss": 158.1731414794922, + "distillation_loss": 5.257561683654785, + "epoch": 1.75, + "learning_rate": 4.1225697379543537e-05, + "loss": 163.999, + "step": 2076, + "task_loss": 2.453101396560669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4777302614259142, + "compression/movement_sparsity/importance_threshold": -6.992763602300944e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230235715991892, + "compression/movement_sparsity/model_sparsity": 0.8913148723621191, + "compression_loss": 158.19631958007812, + "distillation_loss": 9.051553726196289, + "epoch": 1.76, + "learning_rate": 4.122147083685545e-05, + "loss": 164.839, + "step": 2077, + "task_loss": 3.787735939025879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.47795905831173, + "compression/movement_sparsity/importance_threshold": -6.920920705262481e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230572692969282, + "compression/movement_sparsity/model_sparsity": 0.8913474124402745, + "compression_loss": 158.21945190429688, + "distillation_loss": 6.767080307006836, + "epoch": 1.76, + "learning_rate": 4.1217244294167376e-05, + "loss": 164.7223, + "step": 2078, + "task_loss": 2.7486073970794678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4781862827165693, + "compression/movement_sparsity/importance_threshold": -6.849571571889042e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.923190748429444, + "compression/movement_sparsity/model_sparsity": 0.8914763061539587, + "compression_loss": 158.242431640625, + "distillation_loss": 5.395778656005859, + "epoch": 1.76, + "learning_rate": 4.1213017751479296e-05, + "loss": 164.27, + "step": 2079, + "task_loss": 1.5669970512390137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.47841194006278, + "compression/movement_sparsity/importance_threshold": -6.778714499547367e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234242713284248, + "compression/movement_sparsity/model_sparsity": 0.891701806822959, + "compression_loss": 158.26522827148438, + "distillation_loss": 5.859036445617676, + "epoch": 1.76, + "learning_rate": 4.120879120879121e-05, + "loss": 164.1275, + "step": 2080, + "task_loss": 2.150235652923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4786360357727106, + "compression/movement_sparsity/importance_threshold": -6.708347785604023e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9235100418662295, + "compression/movement_sparsity/model_sparsity": 0.891784630878929, + "compression_loss": 158.28778076171875, + "distillation_loss": 7.4065937995910645, + "epoch": 1.76, + "learning_rate": 4.120456466610313e-05, + "loss": 164.3823, + "step": 2081, + "task_loss": 3.0152347087860107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4788585752687093, + "compression/movement_sparsity/importance_threshold": -6.638469727425749e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236629693161594, + "compression/movement_sparsity/model_sparsity": 0.891932304800494, + "compression_loss": 158.31011962890625, + "distillation_loss": 7.203195571899414, + "epoch": 1.76, + "learning_rate": 4.120033812341505e-05, + "loss": 164.8392, + "step": 2082, + "task_loss": 3.356092691421509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4790795639731242, + "compression/movement_sparsity/importance_threshold": -6.569078622379286e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237899378531461, + "compression/movement_sparsity/model_sparsity": 0.8920549115776343, + "compression_loss": 158.33250427246094, + "distillation_loss": 6.737878322601318, + "epoch": 1.76, + "learning_rate": 4.119611158072697e-05, + "loss": 165.0983, + "step": 2083, + "task_loss": 3.8402137756347656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4792990073083037, + "compression/movement_sparsity/importance_threshold": -6.5001727678312e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238590026320931, + "compression/movement_sparsity/model_sparsity": 0.8921216037689563, + "compression_loss": 158.35467529296875, + "distillation_loss": 7.76764440536499, + "epoch": 1.76, + "learning_rate": 4.119188503803889e-05, + "loss": 166.1562, + "step": 2084, + "task_loss": 3.0231523513793945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4795169106965955, + "compression/movement_sparsity/importance_threshold": -6.431750461148317e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239815949995573, + "compression/movement_sparsity/model_sparsity": 0.8922399847114599, + "compression_loss": 158.37669372558594, + "distillation_loss": 5.456912994384766, + "epoch": 1.76, + "learning_rate": 4.118765849535081e-05, + "loss": 164.4382, + "step": 2085, + "task_loss": 3.8630337715148926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4797332795603482, + "compression/movement_sparsity/importance_threshold": -6.363809999697118e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241175305106062, + "compression/movement_sparsity/model_sparsity": 0.8923712504195177, + "compression_loss": 158.39859008789062, + "distillation_loss": 6.736979961395264, + "epoch": 1.76, + "learning_rate": 4.118343195266273e-05, + "loss": 165.563, + "step": 2086, + "task_loss": 3.4097113609313965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4799481193219097, + "compression/movement_sparsity/importance_threshold": -6.296349680844342e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241838288826616, + "compression/movement_sparsity/model_sparsity": 0.8924352712385354, + "compression_loss": 158.42047119140625, + "distillation_loss": 7.493833065032959, + "epoch": 1.76, + "learning_rate": 4.117920540997464e-05, + "loss": 164.8506, + "step": 2087, + "task_loss": 2.2955262660980225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4801614354036285, + "compression/movement_sparsity/importance_threshold": -6.22936780195673e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243525439305414, + "compression/movement_sparsity/model_sparsity": 0.8925981904054924, + "compression_loss": 158.44212341308594, + "distillation_loss": 5.537937164306641, + "epoch": 1.76, + "learning_rate": 4.117497886728656e-05, + "loss": 164.6803, + "step": 2088, + "task_loss": 2.4924726486206055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.480373233227853, + "compression/movement_sparsity/importance_threshold": -6.16286266040076e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.924466777456493, + "compression/movement_sparsity/model_sparsity": 0.892708499658404, + "compression_loss": 158.46356201171875, + "distillation_loss": 7.666211128234863, + "epoch": 1.77, + "learning_rate": 4.117075232459848e-05, + "loss": 165.5147, + "step": 2089, + "task_loss": 3.7078964710235596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4805835182169307, + "compression/movement_sparsity/importance_threshold": -6.096832553543433e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9245956300119661, + "compression/movement_sparsity/model_sparsity": 0.8928329257321999, + "compression_loss": 158.48486328125, + "distillation_loss": 7.490076065063477, + "epoch": 1.77, + "learning_rate": 4.11665257819104e-05, + "loss": 165.6964, + "step": 2090, + "task_loss": 3.988311767578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4807922957932103, + "compression/movement_sparsity/importance_threshold": -6.0312757787511426e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247157183042269, + "compression/movement_sparsity/model_sparsity": 0.8929488886221867, + "compression_loss": 158.50584411621094, + "distillation_loss": 9.895221710205078, + "epoch": 1.77, + "learning_rate": 4.116229923922232e-05, + "loss": 165.6142, + "step": 2091, + "task_loss": 3.813565731048584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4809995713790398, + "compression/movement_sparsity/importance_threshold": -5.966190633390628e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248041002347439, + "compression/movement_sparsity/model_sparsity": 0.8930342343614958, + "compression_loss": 158.52684020996094, + "distillation_loss": 6.144832611083984, + "epoch": 1.77, + "learning_rate": 4.115807269653424e-05, + "loss": 165.2048, + "step": 2092, + "task_loss": 3.0759212970733643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4812053503967673, + "compression/movement_sparsity/importance_threshold": -5.901575414828629e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248985634907553, + "compression/movement_sparsity/model_sparsity": 0.8931254525140602, + "compression_loss": 158.54759216308594, + "distillation_loss": 6.5753021240234375, + "epoch": 1.77, + "learning_rate": 4.115384615384615e-05, + "loss": 163.7794, + "step": 2093, + "task_loss": 2.998349905014038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4814096382687416, + "compression/movement_sparsity/importance_threshold": -5.8374284204317124e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250577392045264, + "compression/movement_sparsity/model_sparsity": 0.8932791600523815, + "compression_loss": 158.5681610107422, + "distillation_loss": 5.708560943603516, + "epoch": 1.77, + "learning_rate": 4.114961961115807e-05, + "loss": 164.6118, + "step": 2094, + "task_loss": 2.953666925430298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.48161244041731, + "compression/movement_sparsity/importance_threshold": -5.773747947566705e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250696991446652, + "compression/movement_sparsity/model_sparsity": 0.8932907091317834, + "compression_loss": 158.58865356445312, + "distillation_loss": 6.482016563415527, + "epoch": 1.77, + "learning_rate": 4.114539306847e-05, + "loss": 164.4768, + "step": 2095, + "task_loss": 3.642782211303711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4818137622648213, + "compression/movement_sparsity/importance_threshold": -5.710532293600173e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9252270385366204, + "compression/movement_sparsity/model_sparsity": 0.8934426434315924, + "compression_loss": 158.6089630126953, + "distillation_loss": 6.840619087219238, + "epoch": 1.77, + "learning_rate": 4.114116652578191e-05, + "loss": 164.3295, + "step": 2096, + "task_loss": 2.9389097690582275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4820136092336238, + "compression/movement_sparsity/importance_threshold": -5.647779755898769e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9252998594283726, + "compression/movement_sparsity/model_sparsity": 0.8935129627016896, + "compression_loss": 158.62913513183594, + "distillation_loss": 6.588991641998291, + "epoch": 1.77, + "learning_rate": 4.113693998309383e-05, + "loss": 164.4038, + "step": 2097, + "task_loss": 3.1686906814575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4822119867460652, + "compression/movement_sparsity/importance_threshold": -5.5854886318291476e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9254340778592819, + "compression/movement_sparsity/model_sparsity": 0.893642570316593, + "compression_loss": 158.64907836914062, + "distillation_loss": 7.505667209625244, + "epoch": 1.77, + "learning_rate": 4.113271344040575e-05, + "loss": 165.3356, + "step": 2098, + "task_loss": 2.816650390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4824089002244938, + "compression/movement_sparsity/importance_threshold": -5.5236572187581344e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9254680140403735, + "compression/movement_sparsity/model_sparsity": 0.8936753406854643, + "compression_loss": 158.6689453125, + "distillation_loss": 5.966259956359863, + "epoch": 1.77, + "learning_rate": 4.1128486897717663e-05, + "loss": 164.5278, + "step": 2099, + "task_loss": 3.4562697410583496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4826043550912582, + "compression/movement_sparsity/importance_threshold": -5.4622838140522094e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9255901890619705, + "compression/movement_sparsity/model_sparsity": 0.8937933186192151, + "compression_loss": 158.68865966796875, + "distillation_loss": 5.01706600189209, + "epoch": 1.77, + "learning_rate": 4.112426035502959e-05, + "loss": 164.7906, + "step": 2100, + "task_loss": 1.6183031797409058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4827983567687062, + "compression/movement_sparsity/importance_threshold": -5.4013667150781125e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9255869456883736, + "compression/movement_sparsity/model_sparsity": 0.8937901866654789, + "compression_loss": 158.70819091796875, + "distillation_loss": 7.5991363525390625, + "epoch": 1.78, + "learning_rate": 4.112003381234151e-05, + "loss": 164.6592, + "step": 2101, + "task_loss": 4.006626605987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.482990910679186, + "compression/movement_sparsity/importance_threshold": -5.3409042192026704e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9256177696617123, + "compression/movement_sparsity/model_sparsity": 0.8938199517405079, + "compression_loss": 158.72752380371094, + "distillation_loss": 6.564167022705078, + "epoch": 1.78, + "learning_rate": 4.111580726965343e-05, + "loss": 164.2414, + "step": 2102, + "task_loss": 3.2785227298736572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.483182022245046, + "compression/movement_sparsity/importance_threshold": -5.280894623792276e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9256572148082518, + "compression/movement_sparsity/model_sparsity": 0.8938580418249162, + "compression_loss": 158.74671936035156, + "distillation_loss": 6.228639602661133, + "epoch": 1.78, + "learning_rate": 4.111158072696534e-05, + "loss": 164.8619, + "step": 2103, + "task_loss": 3.1832244396209717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4833716968886344, + "compression/movement_sparsity/importance_threshold": -5.221336226213843e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9256880984024286, + "compression/movement_sparsity/model_sparsity": 0.893887864472624, + "compression_loss": 158.76576232910156, + "distillation_loss": 7.8237504959106445, + "epoch": 1.78, + "learning_rate": 4.110735418427726e-05, + "loss": 165.655, + "step": 2104, + "task_loss": 3.245042324066162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4835599400322992, + "compression/movement_sparsity/importance_threshold": -5.162227323833937e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.925858792862136, + "compression/movement_sparsity/model_sparsity": 0.8940526950525229, + "compression_loss": 158.78469848632812, + "distillation_loss": 8.02081298828125, + "epoch": 1.78, + "learning_rate": 4.110312764158918e-05, + "loss": 166.3767, + "step": 2105, + "task_loss": 4.341396808624268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4837467570983887, + "compression/movement_sparsity/importance_threshold": -5.1035662140192124e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.925900253193006, + "compression/movement_sparsity/model_sparsity": 0.8940927310934805, + "compression_loss": 158.80349731445312, + "distillation_loss": 4.749598503112793, + "epoch": 1.78, + "learning_rate": 4.10989010989011e-05, + "loss": 164.0564, + "step": 2106, + "task_loss": 3.175304889678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4839321535092513, + "compression/movement_sparsity/importance_threshold": -5.0453511941363216e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9260122688237772, + "compression/movement_sparsity/model_sparsity": 0.8942008986427344, + "compression_loss": 158.82200622558594, + "distillation_loss": 5.590914249420166, + "epoch": 1.78, + "learning_rate": 4.109467455621302e-05, + "loss": 164.4343, + "step": 2107, + "task_loss": 3.014348030090332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4841161346872347, + "compression/movement_sparsity/importance_threshold": -4.9875805615520914e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9261388796357348, + "compression/movement_sparsity/model_sparsity": 0.8943231599838009, + "compression_loss": 158.84027099609375, + "distillation_loss": 7.44611120223999, + "epoch": 1.78, + "learning_rate": 4.109044801352494e-05, + "loss": 165.0043, + "step": 2108, + "task_loss": 4.226390361785889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4842987060546875, + "compression/movement_sparsity/importance_threshold": -4.9302526136330016e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9263273530293873, + "compression/movement_sparsity/model_sparsity": 0.8945051587365694, + "compression_loss": 158.85853576660156, + "distillation_loss": 6.763064384460449, + "epoch": 1.78, + "learning_rate": 4.1086221470836854e-05, + "loss": 165.3203, + "step": 2109, + "task_loss": 4.198666095733643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4844798730339577, + "compression/movement_sparsity/importance_threshold": -4.873365647745879e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9264239507114054, + "compression/movement_sparsity/model_sparsity": 0.894598437991041, + "compression_loss": 158.87661743164062, + "distillation_loss": 6.2183685302734375, + "epoch": 1.78, + "learning_rate": 4.1081994928148774e-05, + "loss": 165.4624, + "step": 2110, + "task_loss": 2.8930840492248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4846596410473936, + "compression/movement_sparsity/importance_threshold": -4.8169179612572895e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9265281440882076, + "compression/movement_sparsity/model_sparsity": 0.8946990520048137, + "compression_loss": 158.89450073242188, + "distillation_loss": 5.854990482330322, + "epoch": 1.78, + "learning_rate": 4.1077768385460694e-05, + "loss": 164.8618, + "step": 2111, + "task_loss": 2.8054862022399902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4848380155173435, + "compression/movement_sparsity/importance_threshold": -4.760907851533887e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9266214626241258, + "compression/movement_sparsity/model_sparsity": 0.8947891647619417, + "compression_loss": 158.91212463378906, + "distillation_loss": 6.993115425109863, + "epoch": 1.78, + "learning_rate": 4.107354184277261e-05, + "loss": 165.0282, + "step": 2112, + "task_loss": 3.1597177982330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4850150018661554, + "compression/movement_sparsity/importance_threshold": -4.7053336159424974e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9267490392936619, + "compression/movement_sparsity/model_sparsity": 0.8949123587804075, + "compression_loss": 158.92962646484375, + "distillation_loss": 7.471574306488037, + "epoch": 1.79, + "learning_rate": 4.106931530008453e-05, + "loss": 165.3545, + "step": 2113, + "task_loss": 2.915780782699585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4851906055161774, + "compression/movement_sparsity/importance_threshold": -4.650193551849688e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9268468770891142, + "compression/movement_sparsity/model_sparsity": 0.8950068355466018, + "compression_loss": 158.94700622558594, + "distillation_loss": 7.544171333312988, + "epoch": 1.79, + "learning_rate": 4.106508875739645e-05, + "loss": 165.4697, + "step": 2114, + "task_loss": 3.156970500946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.485364831889758, + "compression/movement_sparsity/importance_threshold": -4.595485956622111e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9269344839487349, + "compression/movement_sparsity/model_sparsity": 0.8950914328410843, + "compression_loss": 158.9642791748047, + "distillation_loss": 7.283099174499512, + "epoch": 1.79, + "learning_rate": 4.106086221470837e-05, + "loss": 165.5411, + "step": 2115, + "task_loss": 3.2034311294555664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4855376864092453, + "compression/movement_sparsity/importance_threshold": -4.541209127626421e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.926940636819235, + "compression/movement_sparsity/model_sparsity": 0.8950973743415542, + "compression_loss": 158.9813995361328, + "distillation_loss": 8.349763870239258, + "epoch": 1.79, + "learning_rate": 4.1056635672020285e-05, + "loss": 165.8371, + "step": 2116, + "task_loss": 3.8338661193847656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4857091744969875, + "compression/movement_sparsity/importance_threshold": -4.4873613622293564e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9269877611297319, + "compression/movement_sparsity/model_sparsity": 0.8951428797870142, + "compression_loss": 158.99832153320312, + "distillation_loss": 9.298206329345703, + "epoch": 1.79, + "learning_rate": 4.105240912933221e-05, + "loss": 166.3419, + "step": 2117, + "task_loss": 3.6676175594329834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4858793015753324, + "compression/movement_sparsity/importance_threshold": -4.433940957797658e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9271289194262052, + "compression/movement_sparsity/model_sparsity": 0.8952791888617501, + "compression_loss": 159.0152130126953, + "distillation_loss": 7.369715690612793, + "epoch": 1.79, + "learning_rate": 4.104818258664413e-05, + "loss": 165.7438, + "step": 2118, + "task_loss": 3.514754056930542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4860480730666286, + "compression/movement_sparsity/importance_threshold": -4.380946211697805e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9271045225792223, + "compression/movement_sparsity/model_sparsity": 0.8952556301215144, + "compression_loss": 159.03184509277344, + "distillation_loss": 6.515574932098389, + "epoch": 1.79, + "learning_rate": 4.1043956043956045e-05, + "loss": 165.6759, + "step": 2119, + "task_loss": 3.5238170623779297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4862154943932244, + "compression/movement_sparsity/importance_threshold": -4.328375421296625e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9272547551672665, + "compression/movement_sparsity/model_sparsity": 0.8954007017579898, + "compression_loss": 159.0485076904297, + "distillation_loss": 5.713984489440918, + "epoch": 1.79, + "learning_rate": 4.1039729501267964e-05, + "loss": 165.5071, + "step": 2120, + "task_loss": 2.8248162269592285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4863815709774677, + "compression/movement_sparsity/importance_threshold": -4.2762268839606835e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9274026744667894, + "compression/movement_sparsity/model_sparsity": 0.895543539574521, + "compression_loss": 159.0647735595703, + "distillation_loss": 6.610176086425781, + "epoch": 1.79, + "learning_rate": 4.1035502958579884e-05, + "loss": 166.9932, + "step": 2121, + "task_loss": 3.3706016540527344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.486546308241707, + "compression/movement_sparsity/importance_threshold": -4.224498897056721e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9275124125815423, + "compression/movement_sparsity/model_sparsity": 0.8956495078474382, + "compression_loss": 159.08111572265625, + "distillation_loss": 5.752680778503418, + "epoch": 1.79, + "learning_rate": 4.1031276415891804e-05, + "loss": 165.367, + "step": 2122, + "task_loss": 3.135019302368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4867097116082901, + "compression/movement_sparsity/importance_threshold": -4.1731897579514766e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9276375924933835, + "compression/movement_sparsity/model_sparsity": 0.8957703874442093, + "compression_loss": 159.09716796875, + "distillation_loss": 7.08063268661499, + "epoch": 1.79, + "learning_rate": 4.1027049873203724e-05, + "loss": 165.3837, + "step": 2123, + "task_loss": 3.801912307739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4868717864995658, + "compression/movement_sparsity/importance_threshold": -4.122297764011431e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9278611348640531, + "compression/movement_sparsity/model_sparsity": 0.8959862504467496, + "compression_loss": 159.1132354736328, + "distillation_loss": 5.206726551055908, + "epoch": 1.79, + "learning_rate": 4.102282333051564e-05, + "loss": 165.7805, + "step": 2124, + "task_loss": 3.422966480255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4870325383378815, + "compression/movement_sparsity/importance_threshold": -4.0718212126034103e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9279742594424145, + "compression/movement_sparsity/model_sparsity": 0.8960954888478324, + "compression_loss": 159.12901306152344, + "distillation_loss": 6.676248073577881, + "epoch": 1.8, + "learning_rate": 4.1018596787827556e-05, + "loss": 164.8561, + "step": 2125, + "task_loss": 3.43192458152771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.487191972545586, + "compression/movement_sparsity/importance_threshold": -4.021758401094068e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9279257280801365, + "compression/movement_sparsity/model_sparsity": 0.8960486246871486, + "compression_loss": 159.1446990966797, + "distillation_loss": 6.5509114265441895, + "epoch": 1.8, + "learning_rate": 4.1014370245139476e-05, + "loss": 164.9735, + "step": 2126, + "task_loss": 3.757880449295044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.487350094545027, + "compression/movement_sparsity/importance_threshold": -3.972107626850057e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9280090780119112, + "compression/movement_sparsity/model_sparsity": 0.8961291112923525, + "compression_loss": 159.16030883789062, + "distillation_loss": 5.604918003082275, + "epoch": 1.8, + "learning_rate": 4.1010143702451396e-05, + "loss": 164.6136, + "step": 2127, + "task_loss": 3.319204330444336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4875069097585532, + "compression/movement_sparsity/importance_threshold": -3.9228671872380307e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9281407565951141, + "compression/movement_sparsity/model_sparsity": 0.8962562663111315, + "compression_loss": 159.17579650878906, + "distillation_loss": 6.250582695007324, + "epoch": 1.8, + "learning_rate": 4.1005917159763316e-05, + "loss": 165.7747, + "step": 2128, + "task_loss": 3.2477340698242188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4876624236085125, + "compression/movement_sparsity/importance_threshold": -3.874035379624642e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9283059063168707, + "compression/movement_sparsity/model_sparsity": 0.896415742631886, + "compression_loss": 159.19119262695312, + "distillation_loss": 7.016054153442383, + "epoch": 1.8, + "learning_rate": 4.1001690617075235e-05, + "loss": 165.5315, + "step": 2129, + "task_loss": 2.550827980041504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4878166415172533, + "compression/movement_sparsity/importance_threshold": -3.825610501376631e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9283790253128138, + "compression/movement_sparsity/model_sparsity": 0.8964863497653781, + "compression_loss": 159.20648193359375, + "distillation_loss": 5.557750701904297, + "epoch": 1.8, + "learning_rate": 4.0997464074387155e-05, + "loss": 164.8011, + "step": 2130, + "task_loss": 2.319732666015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4879695689071237, + "compression/movement_sparsity/importance_threshold": -3.7775908498606514e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9284869628782537, + "compression/movement_sparsity/model_sparsity": 0.8965905793433903, + "compression_loss": 159.22161865234375, + "distillation_loss": 4.993691444396973, + "epoch": 1.8, + "learning_rate": 4.0993237531699075e-05, + "loss": 165.5109, + "step": 2131, + "task_loss": 2.7405595779418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4881212112004718, + "compression/movement_sparsity/importance_threshold": -3.7299747224433556e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9285709567150807, + "compression/movement_sparsity/model_sparsity": 0.896671687733527, + "compression_loss": 159.23660278320312, + "distillation_loss": 6.839860916137695, + "epoch": 1.8, + "learning_rate": 4.098901098901099e-05, + "loss": 165.1701, + "step": 2132, + "task_loss": 3.1859803199768066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.488271573819646, + "compression/movement_sparsity/importance_threshold": -3.682760416491397e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9287030287958156, + "compression/movement_sparsity/model_sparsity": 0.8967992227319873, + "compression_loss": 159.2515106201172, + "distillation_loss": 6.459246635437012, + "epoch": 1.8, + "learning_rate": 4.098478444632291e-05, + "loss": 164.8118, + "step": 2133, + "task_loss": 2.9032678604125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4884206621869942, + "compression/movement_sparsity/importance_threshold": -3.635946229371429e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9287462539034955, + "compression/movement_sparsity/model_sparsity": 0.8968409629242425, + "compression_loss": 159.26617431640625, + "distillation_loss": 7.230327129364014, + "epoch": 1.8, + "learning_rate": 4.0980557903634834e-05, + "loss": 165.3036, + "step": 2134, + "task_loss": 2.9536778926849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4885684817248646, + "compression/movement_sparsity/importance_threshold": -3.589530458450278e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9288265750966906, + "compression/movement_sparsity/model_sparsity": 0.8969185248373546, + "compression_loss": 159.28060913085938, + "distillation_loss": 7.201566696166992, + "epoch": 1.8, + "learning_rate": 4.097633136094675e-05, + "loss": 165.8674, + "step": 2135, + "task_loss": 4.045864582061768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4887150378556058, + "compression/movement_sparsity/importance_threshold": -3.5435114010943375e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9288811043152895, + "compression/movement_sparsity/model_sparsity": 0.8969711808095431, + "compression_loss": 159.2950897216797, + "distillation_loss": 6.321192264556885, + "epoch": 1.81, + "learning_rate": 4.097210481825867e-05, + "loss": 165.7875, + "step": 2136, + "task_loss": 3.1612582206726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4888603360015658, + "compression/movement_sparsity/importance_threshold": -3.497887354670607e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9289498709900456, + "compression/movement_sparsity/model_sparsity": 0.8970375851374701, + "compression_loss": 159.30931091308594, + "distillation_loss": 4.523503303527832, + "epoch": 1.81, + "learning_rate": 4.0967878275570586e-05, + "loss": 164.9672, + "step": 2137, + "task_loss": 2.5860769748687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4890043815850924, + "compression/movement_sparsity/importance_threshold": -3.45265661654548e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9290225010951155, + "compression/movement_sparsity/model_sparsity": 0.8971077201749947, + "compression_loss": 159.32337951660156, + "distillation_loss": 5.124682426452637, + "epoch": 1.81, + "learning_rate": 4.09636517328825e-05, + "loss": 164.9088, + "step": 2138, + "task_loss": 2.836557149887085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4891471800285343, + "compression/movement_sparsity/importance_threshold": -3.407817484085869e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9291000678055868, + "compression/movement_sparsity/model_sparsity": 0.8971826222303382, + "compression_loss": 159.33731079101562, + "distillation_loss": 7.0748677253723145, + "epoch": 1.81, + "learning_rate": 4.0959425190194426e-05, + "loss": 165.7097, + "step": 2139, + "task_loss": 2.9615187644958496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4892887367542396, + "compression/movement_sparsity/importance_threshold": -3.363368254658168e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9292787514576103, + "compression/movement_sparsity/model_sparsity": 0.8973551675492194, + "compression_loss": 159.35110473632812, + "distillation_loss": 6.316110610961914, + "epoch": 1.81, + "learning_rate": 4.0955198647506346e-05, + "loss": 165.8481, + "step": 2140, + "task_loss": 2.997666835784912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4894290571845565, + "compression/movement_sparsity/importance_threshold": -3.31930722562929e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9294088322023499, + "compression/movement_sparsity/model_sparsity": 0.897480779620202, + "compression_loss": 159.36483764648438, + "distillation_loss": 6.787198066711426, + "epoch": 1.81, + "learning_rate": 4.095097210481826e-05, + "loss": 165.29, + "step": 2141, + "task_loss": 2.8411874771118164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.489568146741833, + "compression/movement_sparsity/importance_threshold": -3.275632694365888e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9294683099505175, + "compression/movement_sparsity/model_sparsity": 0.8975382141247452, + "compression_loss": 159.37843322753906, + "distillation_loss": 6.278470039367676, + "epoch": 1.81, + "learning_rate": 4.094674556213018e-05, + "loss": 165.6805, + "step": 2142, + "task_loss": 3.4164271354675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4897060108484173, + "compression/movement_sparsity/importance_threshold": -3.2323429582344423e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.929585560290881, + "compression/movement_sparsity/model_sparsity": 0.897651436555213, + "compression_loss": 159.39183044433594, + "distillation_loss": 5.80866813659668, + "epoch": 1.81, + "learning_rate": 4.09425190194421e-05, + "loss": 166.2152, + "step": 2143, + "task_loss": 2.716982364654541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.489842654926658, + "compression/movement_sparsity/importance_threshold": -3.189436314601779e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9296640570864279, + "compression/movement_sparsity/model_sparsity": 0.8977272367443484, + "compression_loss": 159.4051971435547, + "distillation_loss": 5.769543647766113, + "epoch": 1.81, + "learning_rate": 4.093829247675402e-05, + "loss": 164.9926, + "step": 2144, + "task_loss": 3.2212955951690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4899780843989028, + "compression/movement_sparsity/importance_threshold": -3.146911060834638e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9297567078689586, + "compression/movement_sparsity/model_sparsity": 0.897816704687472, + "compression_loss": 159.41836547851562, + "distillation_loss": 5.428375720977783, + "epoch": 1.81, + "learning_rate": 4.093406593406594e-05, + "loss": 165.3882, + "step": 2145, + "task_loss": 3.019425630569458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4901123046875, + "compression/movement_sparsity/importance_threshold": -3.1047654942994996e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9297966538305387, + "compression/movement_sparsity/model_sparsity": 0.8978552783823837, + "compression_loss": 159.43136596679688, + "distillation_loss": 5.688624382019043, + "epoch": 1.81, + "learning_rate": 4.092983939137786e-05, + "loss": 165.2465, + "step": 2146, + "task_loss": 2.6111912727355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.490245321214798, + "compression/movement_sparsity/importance_threshold": -3.06299791236319e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9299867250626543, + "compression/movement_sparsity/model_sparsity": 0.8980388200829487, + "compression_loss": 159.44424438476562, + "distillation_loss": 5.891379356384277, + "epoch": 1.81, + "learning_rate": 4.092561284868978e-05, + "loss": 165.4216, + "step": 2147, + "task_loss": 2.8257851600646973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4903771394031449, + "compression/movement_sparsity/importance_threshold": -3.0216066123922754e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9300391556277492, + "compression/movement_sparsity/model_sparsity": 0.8980894494968372, + "compression_loss": 159.45693969726562, + "distillation_loss": 7.017194747924805, + "epoch": 1.82, + "learning_rate": 4.092138630600169e-05, + "loss": 165.6185, + "step": 2148, + "task_loss": 3.9574997425079346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.490507764674889, + "compression/movement_sparsity/importance_threshold": -2.9805898917534962e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9300757985948942, + "compression/movement_sparsity/model_sparsity": 0.8981248336653339, + "compression_loss": 159.469482421875, + "distillation_loss": 6.699184417724609, + "epoch": 1.82, + "learning_rate": 4.091715976331361e-05, + "loss": 165.5337, + "step": 2149, + "task_loss": 3.262906551361084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4906372024523782, + "compression/movement_sparsity/importance_threshold": -2.9399460478134187e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.930150992396006, + "compression/movement_sparsity/model_sparsity": 0.8981974443280543, + "compression_loss": 159.48208618164062, + "distillation_loss": 5.176021099090576, + "epoch": 1.82, + "learning_rate": 4.091293322062553e-05, + "loss": 165.3814, + "step": 2150, + "task_loss": 1.9097723960876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.490765458157961, + "compression/movement_sparsity/importance_threshold": -2.8996733779388696e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9302515727500145, + "compression/movement_sparsity/model_sparsity": 0.8982945694374812, + "compression_loss": 159.49436950683594, + "distillation_loss": 6.5063300132751465, + "epoch": 1.82, + "learning_rate": 4.090870667793745e-05, + "loss": 165.2179, + "step": 2151, + "task_loss": 3.656248092651367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4908925372139854, + "compression/movement_sparsity/importance_threshold": -2.859770179496502e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9303478484795065, + "compression/movement_sparsity/model_sparsity": 0.8983875377994863, + "compression_loss": 159.5064239501953, + "distillation_loss": 6.440917015075684, + "epoch": 1.82, + "learning_rate": 4.090448013524937e-05, + "loss": 165.1823, + "step": 2152, + "task_loss": 3.414710283279419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4910184450427997, + "compression/movement_sparsity/importance_threshold": -2.8202347498528825e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9304561199216401, + "compression/movement_sparsity/model_sparsity": 0.8984920897845008, + "compression_loss": 159.51849365234375, + "distillation_loss": 7.817091464996338, + "epoch": 1.82, + "learning_rate": 4.090025359256129e-05, + "loss": 166.7656, + "step": 2153, + "task_loss": 4.013523578643799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.491143187066752, + "compression/movement_sparsity/importance_threshold": -2.7810653863748376e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9304956962340235, + "compression/movement_sparsity/model_sparsity": 0.8985303065288028, + "compression_loss": 159.5303497314453, + "distillation_loss": 5.37233304977417, + "epoch": 1.82, + "learning_rate": 4.08960270498732e-05, + "loss": 165.4407, + "step": 2154, + "task_loss": 2.8311851024627686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4912667687081906, + "compression/movement_sparsity/importance_threshold": -2.7422603864287604e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9306233444485654, + "compression/movement_sparsity/model_sparsity": 0.8986535696344834, + "compression_loss": 159.54196166992188, + "distillation_loss": 6.927384376525879, + "epoch": 1.82, + "learning_rate": 4.089180050718512e-05, + "loss": 166.795, + "step": 2155, + "task_loss": 3.552914619445801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4913891953894636, + "compression/movement_sparsity/importance_threshold": -2.703818047381651e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.930682762575895, + "compression/movement_sparsity/model_sparsity": 0.8987109465663475, + "compression_loss": 159.5537109375, + "distillation_loss": 5.755553245544434, + "epoch": 1.82, + "learning_rate": 4.088757396449705e-05, + "loss": 165.1994, + "step": 2156, + "task_loss": 3.149372100830078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4915104725329194, + "compression/movement_sparsity/importance_threshold": -2.665736666599989e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9307303519289297, + "compression/movement_sparsity/model_sparsity": 0.8987569010787035, + "compression_loss": 159.565185546875, + "distillation_loss": 4.5558576583862305, + "epoch": 1.82, + "learning_rate": 4.088334742180896e-05, + "loss": 164.7463, + "step": 2157, + "task_loss": 2.1252498626708984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4916306055609059, + "compression/movement_sparsity/importance_threshold": -2.6280145414504276e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9307655162992878, + "compression/movement_sparsity/model_sparsity": 0.8987908574447617, + "compression_loss": 159.57650756835938, + "distillation_loss": 6.22182035446167, + "epoch": 1.82, + "learning_rate": 4.087912087912088e-05, + "loss": 165.9829, + "step": 2158, + "task_loss": 3.830855369567871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4917495998957713, + "compression/movement_sparsity/importance_threshold": -2.5906499692997936e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9308491762594211, + "compression/movement_sparsity/model_sparsity": 0.8988716434278962, + "compression_loss": 159.58770751953125, + "distillation_loss": 6.640441417694092, + "epoch": 1.82, + "learning_rate": 4.08748943364328e-05, + "loss": 165.9178, + "step": 2159, + "task_loss": 2.7593860626220703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4918674609598641, + "compression/movement_sparsity/importance_threshold": -2.5536412475146535e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9308950604564838, + "compression/movement_sparsity/model_sparsity": 0.8989159513616336, + "compression_loss": 159.5988311767578, + "distillation_loss": 7.90350341796875, + "epoch": 1.83, + "learning_rate": 4.087066779374472e-05, + "loss": 165.9282, + "step": 2160, + "task_loss": 4.193729877471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4919841941755325, + "compression/movement_sparsity/importance_threshold": -2.5169866734616603e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9310662915037349, + "compression/movement_sparsity/model_sparsity": 0.8990813000956432, + "compression_loss": 159.60995483398438, + "distillation_loss": 4.9071807861328125, + "epoch": 1.83, + "learning_rate": 4.086644125105663e-05, + "loss": 165.5444, + "step": 2161, + "task_loss": 1.8456387519836426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4920998049651244, + "compression/movement_sparsity/importance_threshold": -2.480684544507554e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9310616053058539, + "compression/movement_sparsity/model_sparsity": 0.8990767748830759, + "compression_loss": 159.62083435058594, + "distillation_loss": 8.28921127319336, + "epoch": 1.83, + "learning_rate": 4.086221470836856e-05, + "loss": 165.9566, + "step": 2162, + "task_loss": 4.00072717666626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4922142987509879, + "compression/movement_sparsity/importance_threshold": -2.444733158018901e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9311380153720645, + "compression/movement_sparsity/model_sparsity": 0.8991505600284474, + "compression_loss": 159.6317138671875, + "distillation_loss": 5.234555721282959, + "epoch": 1.83, + "learning_rate": 4.085798816568048e-05, + "loss": 165.4295, + "step": 2163, + "task_loss": 2.0783655643463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4923276809554715, + "compression/movement_sparsity/importance_threshold": -2.409130811362615e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9312478131076556, + "compression/movement_sparsity/model_sparsity": 0.8992565858740434, + "compression_loss": 159.642578125, + "distillation_loss": 6.005222320556641, + "epoch": 1.83, + "learning_rate": 4.085376162299239e-05, + "loss": 165.5309, + "step": 2164, + "task_loss": 1.9089618921279907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4924399570009235, + "compression/movement_sparsity/importance_threshold": -2.3738758019050017e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9313164009198971, + "compression/movement_sparsity/model_sparsity": 0.8993228174839337, + "compression_loss": 159.6532440185547, + "distillation_loss": 8.237293243408203, + "epoch": 1.83, + "learning_rate": 4.084953508030431e-05, + "loss": 166.4981, + "step": 2165, + "task_loss": 3.315074920654297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4925511323096916, + "compression/movement_sparsity/importance_threshold": -2.3389664270130617e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9313727664603118, + "compression/movement_sparsity/model_sparsity": 0.8993772466946345, + "compression_loss": 159.6637725830078, + "distillation_loss": 4.860881805419922, + "epoch": 1.83, + "learning_rate": 4.084530853761623e-05, + "loss": 165.6496, + "step": 2166, + "task_loss": 2.7041468620300293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4926612123041245, + "compression/movement_sparsity/importance_threshold": -2.3044009840532745e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9314149183929046, + "compression/movement_sparsity/model_sparsity": 0.8994179505786681, + "compression_loss": 159.6742706298828, + "distillation_loss": 5.192461013793945, + "epoch": 1.83, + "learning_rate": 4.084108199492815e-05, + "loss": 165.925, + "step": 2167, + "task_loss": 2.4513585567474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4927702024065703, + "compression/movement_sparsity/importance_threshold": -2.27017777039238e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9314703538482437, + "compression/movement_sparsity/model_sparsity": 0.899471481655577, + "compression_loss": 159.68455505371094, + "distillation_loss": 5.959852695465088, + "epoch": 1.83, + "learning_rate": 4.083685545224007e-05, + "loss": 165.7661, + "step": 2168, + "task_loss": 2.960866928100586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4928781080393767, + "compression/movement_sparsity/importance_threshold": -2.2362950833970316e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9315436636308689, + "compression/movement_sparsity/model_sparsity": 0.8995422730216418, + "compression_loss": 159.69476318359375, + "distillation_loss": 6.3040618896484375, + "epoch": 1.83, + "learning_rate": 4.083262890955199e-05, + "loss": 165.6965, + "step": 2169, + "task_loss": 3.5666940212249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4929849346248927, + "compression/movement_sparsity/importance_threshold": -2.202751220433969e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.93166289338306, + "compression/movement_sparsity/model_sparsity": 0.8996574068650515, + "compression_loss": 159.7049560546875, + "distillation_loss": 6.221062660217285, + "epoch": 1.83, + "learning_rate": 4.0828402366863904e-05, + "loss": 166.3867, + "step": 2170, + "task_loss": 3.006350040435791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.493090687585466, + "compression/movement_sparsity/importance_threshold": -2.1695444788697588e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317134757021713, + "compression/movement_sparsity/model_sparsity": 0.8997062515258919, + "compression_loss": 159.71481323242188, + "distillation_loss": 5.142766952514648, + "epoch": 1.83, + "learning_rate": 4.0824175824175824e-05, + "loss": 165.5593, + "step": 2171, + "task_loss": 2.453266143798828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4931953723434446, + "compression/movement_sparsity/importance_threshold": -2.1366731560711408e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317848656938073, + "compression/movement_sparsity/model_sparsity": 0.8997751890516938, + "compression_loss": 159.72474670410156, + "distillation_loss": 5.263578414916992, + "epoch": 1.84, + "learning_rate": 4.0819949281487743e-05, + "loss": 165.2598, + "step": 2172, + "task_loss": 2.365400552749634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4932989943211772, + "compression/movement_sparsity/importance_threshold": -2.1041355494047682e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317053791923465, + "compression/movement_sparsity/model_sparsity": 0.8996984331560874, + "compression_loss": 159.73452758789062, + "distillation_loss": 5.399177551269531, + "epoch": 1.84, + "learning_rate": 4.081572273879967e-05, + "loss": 165.5539, + "step": 2173, + "task_loss": 2.463406801223755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4934015589410117, + "compression/movement_sparsity/importance_threshold": -2.0719299562372943e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317769957431675, + "compression/movement_sparsity/model_sparsity": 0.8997675894580695, + "compression_loss": 159.74412536621094, + "distillation_loss": 6.036779403686523, + "epoch": 1.84, + "learning_rate": 4.081149619611158e-05, + "loss": 166.406, + "step": 2174, + "task_loss": 3.9989817142486572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4935030716252964, + "compression/movement_sparsity/importance_threshold": -2.040054673935459e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9318264810388563, + "compression/movement_sparsity/model_sparsity": 0.8998153747816168, + "compression_loss": 159.7536163330078, + "distillation_loss": 6.504644393920898, + "epoch": 1.84, + "learning_rate": 4.08072696534235e-05, + "loss": 166.4019, + "step": 2175, + "task_loss": 3.4047913551330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4936035377963792, + "compression/movement_sparsity/importance_threshold": -2.0085079998659154e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9318922666717034, + "compression/movement_sparsity/model_sparsity": 0.8998789004755953, + "compression_loss": 159.7630615234375, + "distillation_loss": 5.924861907958984, + "epoch": 1.84, + "learning_rate": 4.080304311073542e-05, + "loss": 165.715, + "step": 2176, + "task_loss": 3.7973406314849854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4937029628766088, + "compression/movement_sparsity/importance_threshold": -1.97728823139523e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.931958100001221, + "compression/movement_sparsity/model_sparsity": 0.899942472227717, + "compression_loss": 159.77247619628906, + "distillation_loss": 7.481154441833496, + "epoch": 1.84, + "learning_rate": 4.0798816568047335e-05, + "loss": 166.6641, + "step": 2177, + "task_loss": 3.8168303966522217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.493801352288333, + "compression/movement_sparsity/importance_threshold": -1.9463936658901428e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9319233291283948, + "compression/movement_sparsity/model_sparsity": 0.89990889584134, + "compression_loss": 159.78163146972656, + "distillation_loss": 6.10257625579834, + "epoch": 1.84, + "learning_rate": 4.0794590025359255e-05, + "loss": 166.901, + "step": 2178, + "task_loss": 3.3615942001342773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4938987114539004, + "compression/movement_sparsity/importance_threshold": -1.9158226007173935e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9319617130240146, + "compression/movement_sparsity/model_sparsity": 0.8999459611320626, + "compression_loss": 159.79087829589844, + "distillation_loss": 9.399238586425781, + "epoch": 1.84, + "learning_rate": 4.079036348267118e-05, + "loss": 166.5547, + "step": 2179, + "task_loss": 4.1643900871276855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4939950457956588, + "compression/movement_sparsity/importance_threshold": -1.885573333243549e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9320233848190274, + "compression/movement_sparsity/model_sparsity": 0.900005514311192, + "compression_loss": 159.7999725341797, + "distillation_loss": 5.253705024719238, + "epoch": 1.84, + "learning_rate": 4.0786136939983095e-05, + "loss": 165.3041, + "step": 2180, + "task_loss": 3.240295171737671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4940903607359564, + "compression/movement_sparsity/importance_threshold": -1.8556441608354353e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9320854620357397, + "compression/movement_sparsity/model_sparsity": 0.9000654589845384, + "compression_loss": 159.80892944335938, + "distillation_loss": 5.829642295837402, + "epoch": 1.84, + "learning_rate": 4.0781910397295014e-05, + "loss": 166.2426, + "step": 2181, + "task_loss": 3.580606698989868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4941846616971417, + "compression/movement_sparsity/importance_threshold": -1.826033380859446e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9321952359229955, + "compression/movement_sparsity/model_sparsity": 0.9001714618010629, + "compression_loss": 159.8177032470703, + "distillation_loss": 8.56905746459961, + "epoch": 1.84, + "learning_rate": 4.0777683854606934e-05, + "loss": 166.9621, + "step": 2182, + "task_loss": 3.6865451335906982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4942779541015625, + "compression/movement_sparsity/importance_threshold": -1.7967392906825808e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9322956254903217, + "compression/movement_sparsity/model_sparsity": 0.9002684026779172, + "compression_loss": 159.82647705078125, + "distillation_loss": 5.72161865234375, + "epoch": 1.84, + "learning_rate": 4.077345731191885e-05, + "loss": 166.4137, + "step": 2183, + "task_loss": 3.208189010620117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4943702433715673, + "compression/movement_sparsity/importance_threshold": -1.7677601876713196e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9323374435462207, + "compression/movement_sparsity/model_sparsity": 0.9003087841549486, + "compression_loss": 159.83514404296875, + "distillation_loss": 7.543937683105469, + "epoch": 1.85, + "learning_rate": 4.0769230769230773e-05, + "loss": 165.4243, + "step": 2184, + "task_loss": 3.0690970420837402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4944615349295043, + "compression/movement_sparsity/importance_threshold": -1.7390943691923157e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9324379881277264, + "compression/movement_sparsity/model_sparsity": 0.9004058747207682, + "compression_loss": 159.8437042236328, + "distillation_loss": 6.812084674835205, + "epoch": 1.85, + "learning_rate": 4.076500422654269e-05, + "loss": 165.7827, + "step": 2185, + "task_loss": 2.6723556518554688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4945518341977215, + "compression/movement_sparsity/importance_threshold": -1.7107401326123955e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9325237228930282, + "compression/movement_sparsity/model_sparsity": 0.9004886642331309, + "compression_loss": 159.85211181640625, + "distillation_loss": 7.798141956329346, + "epoch": 1.85, + "learning_rate": 4.0760777683854606e-05, + "loss": 166.3794, + "step": 2186, + "task_loss": 3.6036007404327393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4946411465985672, + "compression/movement_sparsity/importance_threshold": -1.682695775298039e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9325968061164684, + "compression/movement_sparsity/model_sparsity": 0.9005592368230156, + "compression_loss": 159.8603973388672, + "distillation_loss": 6.098156929016113, + "epoch": 1.85, + "learning_rate": 4.0756551141166526e-05, + "loss": 166.3632, + "step": 2187, + "task_loss": 1.9365739822387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4947294775543898, + "compression/movement_sparsity/importance_threshold": -1.6549595946160726e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9327273042070753, + "compression/movement_sparsity/model_sparsity": 0.9006852519027512, + "compression_loss": 159.8687286376953, + "distillation_loss": 7.246183395385742, + "epoch": 1.85, + "learning_rate": 4.0752324598478446e-05, + "loss": 166.2349, + "step": 2188, + "task_loss": 2.444012403488159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.494816832487537, + "compression/movement_sparsity/importance_threshold": -1.627529887933063e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9327448208093324, + "compression/movement_sparsity/model_sparsity": 0.9007021667558334, + "compression_loss": 159.8767547607422, + "distillation_loss": 8.717750549316406, + "epoch": 1.85, + "learning_rate": 4.0748098055790365e-05, + "loss": 166.6618, + "step": 2189, + "task_loss": 3.457547664642334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4949032168203573, + "compression/movement_sparsity/importance_threshold": -1.60040495261575e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9328447572582885, + "compression/movement_sparsity/model_sparsity": 0.9007986700803273, + "compression_loss": 159.8848114013672, + "distillation_loss": 6.791445732116699, + "epoch": 1.85, + "learning_rate": 4.0743871513102285e-05, + "loss": 165.9034, + "step": 2190, + "task_loss": 3.6058197021484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.494988635975199, + "compression/movement_sparsity/importance_threshold": -1.5735830860307866e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9329121526537665, + "compression/movement_sparsity/model_sparsity": 0.9008637502366381, + "compression_loss": 159.892822265625, + "distillation_loss": 6.742925643920898, + "epoch": 1.85, + "learning_rate": 4.0739644970414205e-05, + "loss": 166.5036, + "step": 2191, + "task_loss": 3.7004892826080322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4950730953744102, + "compression/movement_sparsity/importance_threshold": -1.5470625855448263e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9329478893841712, + "compression/movement_sparsity/model_sparsity": 0.9008982593004143, + "compression_loss": 159.9007110595703, + "distillation_loss": 8.056020736694336, + "epoch": 1.85, + "learning_rate": 4.0735418427726125e-05, + "loss": 166.5636, + "step": 2192, + "task_loss": 2.8520755767822266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.495156600440339, + "compression/movement_sparsity/importance_threshold": -1.5208417485246088e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.933060441602486, + "compression/movement_sparsity/model_sparsity": 0.901006945003779, + "compression_loss": 159.9083251953125, + "distillation_loss": 4.648472785949707, + "epoch": 1.85, + "learning_rate": 4.073119188503804e-05, + "loss": 165.8134, + "step": 2193, + "task_loss": 2.1897342205047607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4952391565953338, + "compression/movement_sparsity/importance_threshold": -1.4949188723367007e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9331152689252757, + "compression/movement_sparsity/model_sparsity": 0.9010598888393623, + "compression_loss": 159.9159393310547, + "distillation_loss": 6.803522109985352, + "epoch": 1.85, + "learning_rate": 4.072696534234996e-05, + "loss": 166.1004, + "step": 2194, + "task_loss": 2.787987470626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4953207692617425, + "compression/movement_sparsity/importance_threshold": -1.4692922543478419e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.933110737741574, + "compression/movement_sparsity/model_sparsity": 0.9010555133157604, + "compression_loss": 159.92356872558594, + "distillation_loss": 6.17678165435791, + "epoch": 1.85, + "learning_rate": 4.072273879966188e-05, + "loss": 165.7399, + "step": 2195, + "task_loss": 2.9803643226623535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4954014438619134, + "compression/movement_sparsity/importance_threshold": -1.4439601919246856e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9331464148511406, + "compression/movement_sparsity/model_sparsity": 0.9010899648068577, + "compression_loss": 159.93101501464844, + "distillation_loss": 4.351841449737549, + "epoch": 1.86, + "learning_rate": 4.07185122569738e-05, + "loss": 165.9034, + "step": 2196, + "task_loss": 2.2298755645751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.495481185818195, + "compression/movement_sparsity/importance_threshold": -1.4189209824339716e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.933213893715792, + "compression/movement_sparsity/model_sparsity": 0.901155125564919, + "compression_loss": 159.93820190429688, + "distillation_loss": 5.4615559577941895, + "epoch": 1.86, + "learning_rate": 4.0714285714285717e-05, + "loss": 165.991, + "step": 2197, + "task_loss": 2.954812526702881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4955600005529348, + "compression/movement_sparsity/importance_threshold": -1.3941729232421797e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9333205196227918, + "compression/movement_sparsity/model_sparsity": 0.9012580885439937, + "compression_loss": 159.94540405273438, + "distillation_loss": 7.4501543045043945, + "epoch": 1.86, + "learning_rate": 4.0710059171597636e-05, + "loss": 165.7139, + "step": 2198, + "task_loss": 3.646406412124634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4956378934884818, + "compression/movement_sparsity/importance_threshold": -1.3697143117162233e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9334077210607129, + "compression/movement_sparsity/model_sparsity": 0.9013422943442592, + "compression_loss": 159.95249938964844, + "distillation_loss": 7.007129669189453, + "epoch": 1.86, + "learning_rate": 4.070583262890955e-05, + "loss": 165.9149, + "step": 2199, + "task_loss": 3.753594160079956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4957148700471836, + "compression/movement_sparsity/importance_threshold": -1.3455434452226689e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9333678227958033, + "compression/movement_sparsity/model_sparsity": 0.9013037667074907, + "compression_loss": 159.95947265625, + "distillation_loss": 5.433218955993652, + "epoch": 1.86, + "learning_rate": 4.070160608622147e-05, + "loss": 166.3544, + "step": 2200, + "task_loss": 2.5983219146728516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.495790935651389, + "compression/movement_sparsity/importance_threshold": -1.3216586211281696e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9334682719839679, + "compression/movement_sparsity/model_sparsity": 0.9014007651570239, + "compression_loss": 159.96640014648438, + "distillation_loss": 6.349350452423096, + "epoch": 1.86, + "learning_rate": 4.0697379543533395e-05, + "loss": 165.8351, + "step": 2201, + "task_loss": 3.0378222465515137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4958660957234455, + "compression/movement_sparsity/importance_threshold": -1.2980581367993786e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9335358820144631, + "compression/movement_sparsity/model_sparsity": 0.9014660525749789, + "compression_loss": 159.97323608398438, + "distillation_loss": 6.311150074005127, + "epoch": 1.86, + "learning_rate": 4.0693153000845315e-05, + "loss": 166.0686, + "step": 2202, + "task_loss": 3.006481885910034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4959403556857018, + "compression/movement_sparsity/importance_threshold": -1.2747402896029492e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.933615559302606, + "compression/movement_sparsity/model_sparsity": 0.9015429927031581, + "compression_loss": 159.9799346923828, + "distillation_loss": 6.676658630371094, + "epoch": 1.86, + "learning_rate": 4.068892645815723e-05, + "loss": 166.5747, + "step": 2203, + "task_loss": 3.5015902519226074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4960137209605058, + "compression/movement_sparsity/importance_threshold": -1.2517033769057079e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9336323962273079, + "compression/movement_sparsity/model_sparsity": 0.9015592512277, + "compression_loss": 159.98660278320312, + "distillation_loss": 6.951223850250244, + "epoch": 1.86, + "learning_rate": 4.068469991546915e-05, + "loss": 166.7534, + "step": 2204, + "task_loss": 4.691531658172607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4960861969702057, + "compression/movement_sparsity/importance_threshold": -1.2289456960742212e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9337604498635493, + "compression/movement_sparsity/model_sparsity": 0.9016829058275976, + "compression_loss": 159.99314880371094, + "distillation_loss": 7.449137210845947, + "epoch": 1.86, + "learning_rate": 4.068047337278107e-05, + "loss": 166.2661, + "step": 2205, + "task_loss": 3.614295721054077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.49615778913715, + "compression/movement_sparsity/importance_threshold": -1.2064655444751424e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9339130315126178, + "compression/movement_sparsity/model_sparsity": 0.9018302458276245, + "compression_loss": 159.99954223632812, + "distillation_loss": 7.655429840087891, + "epoch": 1.86, + "learning_rate": 4.067624683009299e-05, + "loss": 166.3201, + "step": 2206, + "task_loss": 4.149497032165527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4962285028836866, + "compression/movement_sparsity/importance_threshold": -1.1842612194751245e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9339630653200179, + "compression/movement_sparsity/model_sparsity": 0.9018785608198183, + "compression_loss": 160.0059051513672, + "distillation_loss": 7.039043426513672, + "epoch": 1.87, + "learning_rate": 4.067202028740491e-05, + "loss": 166.503, + "step": 2207, + "task_loss": 3.951432228088379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.496298343632164, + "compression/movement_sparsity/importance_threshold": -1.1623310184409076e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.933984218793404, + "compression/movement_sparsity/model_sparsity": 0.9018989876063179, + "compression_loss": 160.0121612548828, + "distillation_loss": 8.174517631530762, + "epoch": 1.87, + "learning_rate": 4.066779374471683e-05, + "loss": 167.0821, + "step": 2208, + "task_loss": 3.1212151050567627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4963673168049298, + "compression/movement_sparsity/importance_threshold": -1.1406732387391448e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9340117636206429, + "compression/movement_sparsity/model_sparsity": 0.9019255861840033, + "compression_loss": 160.01821899414062, + "distillation_loss": 7.411257743835449, + "epoch": 1.87, + "learning_rate": 4.066356720202874e-05, + "loss": 167.1383, + "step": 2209, + "task_loss": 4.18795919418335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4964354278243328, + "compression/movement_sparsity/importance_threshold": -1.1192861777364893e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9341308264344869, + "compression/movement_sparsity/model_sparsity": 0.9020405588239118, + "compression_loss": 160.0242462158203, + "distillation_loss": 7.036897659301758, + "epoch": 1.87, + "learning_rate": 4.065934065934066e-05, + "loss": 166.6799, + "step": 2210, + "task_loss": 3.060356616973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4965026821127212, + "compression/movement_sparsity/importance_threshold": -1.0981681327995943e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9341863572831671, + "compression/movement_sparsity/model_sparsity": 0.9020941820171069, + "compression_loss": 160.0299835205078, + "distillation_loss": 7.661181926727295, + "epoch": 1.87, + "learning_rate": 4.065511411665258e-05, + "loss": 166.3157, + "step": 2211, + "task_loss": 2.883307933807373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4965690850924427, + "compression/movement_sparsity/importance_threshold": -1.0773174012952864e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9342052332405347, + "compression/movement_sparsity/model_sparsity": 0.9021124095272697, + "compression_loss": 160.0356903076172, + "distillation_loss": 6.834277629852295, + "epoch": 1.87, + "learning_rate": 4.06508875739645e-05, + "loss": 166.9267, + "step": 2212, + "task_loss": 2.8401851654052734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4966346421858459, + "compression/movement_sparsity/importance_threshold": -1.0567322805899587e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9342909083849984, + "compression/movement_sparsity/model_sparsity": 0.9021951414669535, + "compression_loss": 160.04135131835938, + "distillation_loss": 6.103752613067627, + "epoch": 1.87, + "learning_rate": 4.064666103127642e-05, + "loss": 165.3373, + "step": 2213, + "task_loss": 3.3360533714294434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4966993588152788, + "compression/movement_sparsity/importance_threshold": -1.0364110680505245e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9342650567895638, + "compression/movement_sparsity/model_sparsity": 0.9021701779533509, + "compression_loss": 160.04685974121094, + "distillation_loss": 7.086657524108887, + "epoch": 1.87, + "learning_rate": 4.064243448858834e-05, + "loss": 166.2703, + "step": 2214, + "task_loss": 3.6259615421295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4967632404030895, + "compression/movement_sparsity/importance_threshold": -1.0163520610435503e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.934326382783715, + "compression/movement_sparsity/model_sparsity": 0.9022293972109422, + "compression_loss": 160.0523223876953, + "distillation_loss": 7.847537040710449, + "epoch": 1.87, + "learning_rate": 4.063820794590025e-05, + "loss": 166.6605, + "step": 2215, + "task_loss": 3.4278292655944824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4968262923716265, + "compression/movement_sparsity/importance_threshold": -9.96553556935776e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9344232070249182, + "compression/movement_sparsity/model_sparsity": 0.9023228952415939, + "compression_loss": 160.05772399902344, + "distillation_loss": 7.414052963256836, + "epoch": 1.87, + "learning_rate": 4.063398140321217e-05, + "loss": 166.4403, + "step": 2216, + "task_loss": 3.3616738319396973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.496888520143238, + "compression/movement_sparsity/importance_threshold": -9.77013853093768e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9345048756490563, + "compression/movement_sparsity/model_sparsity": 0.9024017582972507, + "compression_loss": 160.0629119873047, + "distillation_loss": 6.021695137023926, + "epoch": 1.87, + "learning_rate": 4.062975486052409e-05, + "loss": 165.4394, + "step": 2217, + "task_loss": 2.9772891998291016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4969499291402717, + "compression/movement_sparsity/importance_threshold": -9.577312468842664e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9345621116537084, + "compression/movement_sparsity/model_sparsity": 0.9024570280690645, + "compression_loss": 160.068115234375, + "distillation_loss": 5.793888568878174, + "epoch": 1.87, + "learning_rate": 4.062552831783602e-05, + "loss": 166.2238, + "step": 2218, + "task_loss": 3.2094666957855225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4970105247850765, + "compression/movement_sparsity/importance_threshold": -9.387040356739242e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9346653272487645, + "compression/movement_sparsity/model_sparsity": 0.9025566978909021, + "compression_loss": 160.07327270507812, + "distillation_loss": 5.235283851623535, + "epoch": 1.88, + "learning_rate": 4.062130177514793e-05, + "loss": 166.5598, + "step": 2219, + "task_loss": 3.1168527603149414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4970703125, + "compression/movement_sparsity/importance_threshold": -9.199305168294813e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9347840204134118, + "compression/movement_sparsity/model_sparsity": 0.9026713135802009, + "compression_loss": 160.0782470703125, + "distillation_loss": 5.164148330688477, + "epoch": 1.88, + "learning_rate": 4.061707523245985e-05, + "loss": 165.6407, + "step": 2220, + "task_loss": 2.7088468074798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4971292977073907, + "compression/movement_sparsity/importance_threshold": -9.014089877175044e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9348311804964117, + "compression/movement_sparsity/model_sparsity": 0.9027168535692683, + "compression_loss": 160.08314514160156, + "distillation_loss": 7.312452793121338, + "epoch": 1.88, + "learning_rate": 4.061284868977177e-05, + "loss": 166.1839, + "step": 2221, + "task_loss": 3.081338405609131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4971874858295968, + "compression/movement_sparsity/importance_threshold": -8.831377457047332e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9348431165882152, + "compression/movement_sparsity/model_sparsity": 0.9027283796195986, + "compression_loss": 160.08787536621094, + "distillation_loss": 6.593239784240723, + "epoch": 1.88, + "learning_rate": 4.060862214708368e-05, + "loss": 166.7792, + "step": 2222, + "task_loss": 4.395792484283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4972448822889661, + "compression/movement_sparsity/importance_threshold": -8.65115088157821e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9348386092528488, + "compression/movement_sparsity/model_sparsity": 0.9027240271250683, + "compression_loss": 160.092529296875, + "distillation_loss": 5.105921745300293, + "epoch": 1.88, + "learning_rate": 4.060439560439561e-05, + "loss": 165.7791, + "step": 2223, + "task_loss": 3.8668746948242188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4973014925078474, + "compression/movement_sparsity/importance_threshold": -8.473393124434209e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349293044718872, + "compression/movement_sparsity/model_sparsity": 0.9028116066843216, + "compression_loss": 160.0972442626953, + "distillation_loss": 6.887876987457275, + "epoch": 1.88, + "learning_rate": 4.060016906170753e-05, + "loss": 166.7268, + "step": 2224, + "task_loss": 4.878623008728027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4973573219085887, + "compression/movement_sparsity/importance_threshold": -8.298087159281861e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349935041904387, + "compression/movement_sparsity/model_sparsity": 0.9028736009450394, + "compression_loss": 160.10179138183594, + "distillation_loss": 9.14314079284668, + "epoch": 1.88, + "learning_rate": 4.059594251901944e-05, + "loss": 167.1431, + "step": 2225, + "task_loss": 4.028992652893066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4974123759135378, + "compression/movement_sparsity/importance_threshold": -8.125215959789434e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349920136694843, + "compression/movement_sparsity/model_sparsity": 0.902872161628065, + "compression_loss": 160.10635375976562, + "distillation_loss": 5.251765251159668, + "epoch": 1.88, + "learning_rate": 4.059171597633136e-05, + "loss": 165.9658, + "step": 2226, + "task_loss": 2.324880361557007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4974666599450435, + "compression/movement_sparsity/importance_threshold": -7.954762499621723e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9350739923219807, + "compression/movement_sparsity/model_sparsity": 0.9029513240616526, + "compression_loss": 160.11093139648438, + "distillation_loss": 7.090186595916748, + "epoch": 1.88, + "learning_rate": 4.058748943364328e-05, + "loss": 165.9931, + "step": 2227, + "task_loss": 2.8340048789978027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4975201794254536, + "compression/movement_sparsity/importance_threshold": -7.786709752446129e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9351306798149217, + "compression/movement_sparsity/model_sparsity": 0.9030060641648198, + "compression_loss": 160.11534118652344, + "distillation_loss": 5.584734916687012, + "epoch": 1.88, + "learning_rate": 4.05832628909552e-05, + "loss": 166.7349, + "step": 2228, + "task_loss": 3.7214515209198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4975729397771163, + "compression/movement_sparsity/importance_threshold": -7.621040691929183e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9351840166167569, + "compression/movement_sparsity/model_sparsity": 0.9030575686834288, + "compression_loss": 160.1196746826172, + "distillation_loss": 4.749370574951172, + "epoch": 1.88, + "learning_rate": 4.057903634826712e-05, + "loss": 166.1906, + "step": 2229, + "task_loss": 2.4553816318511963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4976249464223799, + "compression/movement_sparsity/importance_threshold": -7.457738291738285e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9352279094778245, + "compression/movement_sparsity/model_sparsity": 0.9030999536896885, + "compression_loss": 160.12380981445312, + "distillation_loss": 7.9535441398620605, + "epoch": 1.88, + "learning_rate": 4.057480980557904e-05, + "loss": 167.5517, + "step": 2230, + "task_loss": 3.4968388080596924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4976762047835925, + "compression/movement_sparsity/importance_threshold": -7.296785525539966e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9352907259929302, + "compression/movement_sparsity/model_sparsity": 0.9031606122642541, + "compression_loss": 160.1279296875, + "distillation_loss": 8.037897109985352, + "epoch": 1.89, + "learning_rate": 4.057058326289096e-05, + "loss": 166.6816, + "step": 2231, + "task_loss": 3.4865970611572266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4977267202831026, + "compression/movement_sparsity/importance_threshold": -7.138165366999891e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9353377072134155, + "compression/movement_sparsity/model_sparsity": 0.9032059795352846, + "compression_loss": 160.1321258544922, + "distillation_loss": 6.906966686248779, + "epoch": 1.89, + "learning_rate": 4.0566356720202873e-05, + "loss": 166.5066, + "step": 2232, + "task_loss": 2.8928592205047607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4977764983432582, + "compression/movement_sparsity/importance_threshold": -6.981860789786326e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9353911274844242, + "compression/movement_sparsity/model_sparsity": 0.9032575646556441, + "compression_loss": 160.13607788085938, + "distillation_loss": 6.219843864440918, + "epoch": 1.89, + "learning_rate": 4.056213017751479e-05, + "loss": 167.1917, + "step": 2233, + "task_loss": 2.7306981086730957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4978255443864072, + "compression/movement_sparsity/importance_threshold": -6.827854767565804e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9354974552872332, + "compression/movement_sparsity/model_sparsity": 0.9033602397713241, + "compression_loss": 160.13999938964844, + "distillation_loss": 6.1700849533081055, + "epoch": 1.89, + "learning_rate": 4.055790363482671e-05, + "loss": 166.186, + "step": 2234, + "task_loss": 2.377737522125244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4978738638348983, + "compression/movement_sparsity/importance_threshold": -6.676130274003121e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9355415031624801, + "compression/movement_sparsity/model_sparsity": 0.9034027744665492, + "compression_loss": 160.1438751220703, + "distillation_loss": 6.190776824951172, + "epoch": 1.89, + "learning_rate": 4.055367709213863e-05, + "loss": 166.3818, + "step": 2235, + "task_loss": 2.9691388607025146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4979214621110792, + "compression/movement_sparsity/importance_threshold": -6.526670282767412e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9356016009673648, + "compression/movement_sparsity/model_sparsity": 0.9034608077269536, + "compression_loss": 160.1475372314453, + "distillation_loss": 6.897368431091309, + "epoch": 1.89, + "learning_rate": 4.054945054945055e-05, + "loss": 166.3111, + "step": 2236, + "task_loss": 3.5071003437042236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4979683446372987, + "compression/movement_sparsity/importance_threshold": -6.379457767524341e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9357159418108251, + "compression/movement_sparsity/model_sparsity": 0.9035712206106875, + "compression_loss": 160.1512451171875, + "distillation_loss": 4.895448684692383, + "epoch": 1.89, + "learning_rate": 4.054522400676247e-05, + "loss": 166.3977, + "step": 2237, + "task_loss": 2.651609182357788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4980145168359045, + "compression/movement_sparsity/importance_threshold": -6.234475701939572e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9358109535785476, + "compression/movement_sparsity/model_sparsity": 0.9036629684318983, + "compression_loss": 160.15481567382812, + "distillation_loss": 5.931543350219727, + "epoch": 1.89, + "learning_rate": 4.0540997464074385e-05, + "loss": 166.6772, + "step": 2238, + "task_loss": 3.097256660461426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4980599841292448, + "compression/movement_sparsity/importance_threshold": -6.09170705968224e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.935832619791142, + "compression/movement_sparsity/model_sparsity": 0.903683890343437, + "compression_loss": 160.15835571289062, + "distillation_loss": 5.540727615356445, + "epoch": 1.89, + "learning_rate": 4.0536770921386305e-05, + "loss": 166.4461, + "step": 2239, + "task_loss": 3.2605767250061035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498104751939668, + "compression/movement_sparsity/importance_threshold": -5.951134814417142e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9358843945270169, + "compression/movement_sparsity/model_sparsity": 0.9037338864578569, + "compression_loss": 160.1618194580078, + "distillation_loss": 6.151486873626709, + "epoch": 1.89, + "learning_rate": 4.053254437869823e-05, + "loss": 167.0416, + "step": 2240, + "task_loss": 2.844891309738159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4981488256895221, + "compression/movement_sparsity/importance_threshold": -5.812741939811676e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9359477834021692, + "compression/movement_sparsity/model_sparsity": 0.9037950977301407, + "compression_loss": 160.16519165039062, + "distillation_loss": 7.432068347930908, + "epoch": 1.89, + "learning_rate": 4.0528317836010144e-05, + "loss": 166.8899, + "step": 2241, + "task_loss": 3.0972211360931396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4981922108011556, + "compression/movement_sparsity/importance_threshold": -5.676511409533243e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.936020520824748, + "compression/movement_sparsity/model_sparsity": 0.9038653363984874, + "compression_loss": 160.1685333251953, + "distillation_loss": 7.028159141540527, + "epoch": 1.89, + "learning_rate": 4.0524091293322064e-05, + "loss": 166.0256, + "step": 2242, + "task_loss": 3.8466532230377197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4982349126969166, + "compression/movement_sparsity/importance_threshold": -5.542426197247506e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9360695649262343, + "compression/movement_sparsity/model_sparsity": 0.9039126956842104, + "compression_loss": 160.17178344726562, + "distillation_loss": 8.252093315124512, + "epoch": 1.9, + "learning_rate": 4.0519864750633984e-05, + "loss": 166.6953, + "step": 2243, + "task_loss": 3.964700222015381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498276936799153, + "compression/movement_sparsity/importance_threshold": -5.410469276620998e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9360664050218107, + "compression/movement_sparsity/model_sparsity": 0.9039096443322248, + "compression_loss": 160.17498779296875, + "distillation_loss": 6.128657341003418, + "epoch": 1.9, + "learning_rate": 4.05156382079459e-05, + "loss": 167.3083, + "step": 2244, + "task_loss": 2.7759294509887695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4983182885302133, + "compression/movement_sparsity/importance_threshold": -5.280623621321118e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9360977298101901, + "compression/movement_sparsity/model_sparsity": 0.903939893017757, + "compression_loss": 160.17808532714844, + "distillation_loss": 7.01299524307251, + "epoch": 1.9, + "learning_rate": 4.051141166525782e-05, + "loss": 166.3831, + "step": 2245, + "task_loss": 3.278843641281128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4983589733124458, + "compression/movement_sparsity/importance_threshold": -5.152872205014397e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9361933735587974, + "compression/movement_sparsity/model_sparsity": 0.904032251109365, + "compression_loss": 160.18113708496094, + "distillation_loss": 5.842001914978027, + "epoch": 1.9, + "learning_rate": 4.050718512256974e-05, + "loss": 167.1998, + "step": 2246, + "task_loss": 3.185584545135498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498398996568198, + "compression/movement_sparsity/importance_threshold": -5.027198001367368e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9362010646469225, + "compression/movement_sparsity/model_sparsity": 0.9040396779849525, + "compression_loss": 160.1841278076172, + "distillation_loss": 4.510096073150635, + "epoch": 1.9, + "learning_rate": 4.050295857988166e-05, + "loss": 165.4819, + "step": 2247, + "task_loss": 2.5500590801239014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498438363719819, + "compression/movement_sparsity/importance_threshold": -4.90358398404743e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9362972569072411, + "compression/movement_sparsity/model_sparsity": 0.9041325657452071, + "compression_loss": 160.18699645996094, + "distillation_loss": 6.285117149353027, + "epoch": 1.9, + "learning_rate": 4.0498732037193576e-05, + "loss": 166.702, + "step": 2248, + "task_loss": 3.9513864517211914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4984770801896568, + "compression/movement_sparsity/importance_threshold": -4.782013126721114e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9363535866751529, + "compression/movement_sparsity/model_sparsity": 0.9041869604123005, + "compression_loss": 160.18984985351562, + "distillation_loss": 7.873725891113281, + "epoch": 1.9, + "learning_rate": 4.0494505494505496e-05, + "loss": 166.643, + "step": 2249, + "task_loss": 3.337759256362915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498515151400059, + "compression/movement_sparsity/importance_threshold": -4.662468403054086e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9363670252120785, + "compression/movement_sparsity/model_sparsity": 0.9041999372941409, + "compression_loss": 160.19252014160156, + "distillation_loss": 7.402373790740967, + "epoch": 1.9, + "learning_rate": 4.0490278951817415e-05, + "loss": 166.4185, + "step": 2250, + "task_loss": 4.273791313171387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4985525827733743, + "compression/movement_sparsity/importance_threshold": -4.544932786713744e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9364439718658327, + "compression/movement_sparsity/model_sparsity": 0.9042742405936232, + "compression_loss": 160.1952667236328, + "distillation_loss": 7.206536293029785, + "epoch": 1.9, + "learning_rate": 4.0486052409129335e-05, + "loss": 166.3253, + "step": 2251, + "task_loss": 2.7881734371185303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4985893797319507, + "compression/movement_sparsity/importance_threshold": -4.4293892513683555e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9365063591109035, + "compression/movement_sparsity/model_sparsity": 0.9043344846449002, + "compression_loss": 160.1979522705078, + "distillation_loss": 7.284627914428711, + "epoch": 1.9, + "learning_rate": 4.0481825866441255e-05, + "loss": 166.9501, + "step": 2252, + "task_loss": 2.7389354705810547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4986255476981365, + "compression/movement_sparsity/importance_threshold": -4.31582077068185e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9365434074997482, + "compression/movement_sparsity/model_sparsity": 0.9043702603076138, + "compression_loss": 160.2005157470703, + "distillation_loss": 6.506577014923096, + "epoch": 1.9, + "learning_rate": 4.0477599323753174e-05, + "loss": 165.6199, + "step": 2253, + "task_loss": 3.5874550342559814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.49866109209428, + "compression/movement_sparsity/importance_threshold": -4.204210318323361e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9366264474031644, + "compression/movement_sparsity/model_sparsity": 0.904450447534887, + "compression_loss": 160.2029571533203, + "distillation_loss": 5.827661514282227, + "epoch": 1.9, + "learning_rate": 4.047337278106509e-05, + "loss": 165.3543, + "step": 2254, + "task_loss": 2.079104423522949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4986960183427291, + "compression/movement_sparsity/importance_threshold": -4.0945408679576856e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9366830275785964, + "compression/movement_sparsity/model_sparsity": 0.9045050840072321, + "compression_loss": 160.20545959472656, + "distillation_loss": 8.142450332641602, + "epoch": 1.91, + "learning_rate": 4.046914623837701e-05, + "loss": 167.1443, + "step": 2255, + "task_loss": 4.10180139541626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4987303318658323, + "compression/movement_sparsity/importance_threshold": -3.986795393252224e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9367423503125849, + "compression/movement_sparsity/model_sparsity": 0.9045623688228099, + "compression_loss": 160.20797729492188, + "distillation_loss": 5.617791175842285, + "epoch": 1.91, + "learning_rate": 4.046491969568893e-05, + "loss": 166.0464, + "step": 2256, + "task_loss": 3.0598959922790527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4987640380859375, + "compression/movement_sparsity/importance_threshold": -3.8809568678743744e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9367532132293012, + "compression/movement_sparsity/model_sparsity": 0.9045728585649188, + "compression_loss": 160.21031188964844, + "distillation_loss": 6.853257179260254, + "epoch": 1.91, + "learning_rate": 4.046069315300085e-05, + "loss": 166.9046, + "step": 2257, + "task_loss": 2.676332712173462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4987971424253932, + "compression/movement_sparsity/importance_threshold": -3.777008265489802e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368125836599601, + "compression/movement_sparsity/model_sparsity": 0.9046301894386398, + "compression_loss": 160.21267700195312, + "distillation_loss": 6.59063720703125, + "epoch": 1.91, + "learning_rate": 4.0456466610312766e-05, + "loss": 166.0765, + "step": 2258, + "task_loss": 2.8540420532226562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4988296503065472, + "compression/movement_sparsity/importance_threshold": -3.6749325597667734e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368201078097383, + "compression/movement_sparsity/model_sparsity": 0.9046374551107261, + "compression_loss": 160.2148895263672, + "distillation_loss": 5.941023349761963, + "epoch": 1.91, + "learning_rate": 4.0452240067624686e-05, + "loss": 165.4657, + "step": 2259, + "task_loss": 2.8525338172912598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4988615671517482, + "compression/movement_sparsity/importance_threshold": -3.5747127243700855e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368015538048969, + "compression/movement_sparsity/model_sparsity": 0.9046195384930298, + "compression_loss": 160.2171173095703, + "distillation_loss": 6.166594505310059, + "epoch": 1.91, + "learning_rate": 4.0448013524936606e-05, + "loss": 166.8199, + "step": 2260, + "task_loss": 3.9337363243103027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498892898383344, + "compression/movement_sparsity/importance_threshold": -3.476331732968005e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368916885880565, + "compression/movement_sparsity/model_sparsity": 0.9047065768691007, + "compression_loss": 160.21925354003906, + "distillation_loss": 5.930685043334961, + "epoch": 1.91, + "learning_rate": 4.044378698224852e-05, + "loss": 166.1881, + "step": 2261, + "task_loss": 3.537510871887207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498923649423683, + "compression/movement_sparsity/importance_threshold": -3.3797725592270636e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368938349382309, + "compression/movement_sparsity/model_sparsity": 0.9047086494855437, + "compression_loss": 160.22125244140625, + "distillation_loss": 6.61134672164917, + "epoch": 1.91, + "learning_rate": 4.0439560439560445e-05, + "loss": 166.3565, + "step": 2262, + "task_loss": 3.2768218517303467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4989538256951132, + "compression/movement_sparsity/importance_threshold": -3.285018176812926e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9369106122420946, + "compression/movement_sparsity/model_sparsity": 0.9047248504374067, + "compression_loss": 160.2233428955078, + "distillation_loss": 7.624445915222168, + "epoch": 1.91, + "learning_rate": 4.0435333896872365e-05, + "loss": 166.6674, + "step": 2263, + "task_loss": 3.4444010257720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4989834326199831, + "compression/movement_sparsity/importance_threshold": -3.1920515593929916e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.936960371793639, + "compression/movement_sparsity/model_sparsity": 0.9047729005952774, + "compression_loss": 160.22523498535156, + "distillation_loss": 8.485793113708496, + "epoch": 1.91, + "learning_rate": 4.043110735418428e-05, + "loss": 167.0411, + "step": 2264, + "task_loss": 4.363717555999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4990124756206404, + "compression/movement_sparsity/importance_threshold": -3.1008556806346593e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9369796054760356, + "compression/movement_sparsity/model_sparsity": 0.9047914735415139, + "compression_loss": 160.22708129882812, + "distillation_loss": 5.798768043518066, + "epoch": 1.91, + "learning_rate": 4.04268808114962e-05, + "loss": 166.468, + "step": 2265, + "task_loss": 3.9484541416168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499040960119434, + "compression/movement_sparsity/importance_threshold": -3.0114135142027265e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9370010332052773, + "compression/movement_sparsity/model_sparsity": 0.9048121651623368, + "compression_loss": 160.22891235351562, + "distillation_loss": 6.1181488037109375, + "epoch": 1.91, + "learning_rate": 4.042265426880812e-05, + "loss": 166.3589, + "step": 2266, + "task_loss": 3.781599521636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4990688915387116, + "compression/movement_sparsity/importance_threshold": -2.923708033766327e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9370265389998504, + "compression/movement_sparsity/model_sparsity": 0.9048367947544013, + "compression_loss": 160.23072814941406, + "distillation_loss": 5.796014785766602, + "epoch": 1.92, + "learning_rate": 4.041842772612004e-05, + "loss": 165.7493, + "step": 2267, + "task_loss": 1.9990835189819336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4990962753008215, + "compression/movement_sparsity/importance_threshold": -2.8377222129902582e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9371044157386802, + "compression/movement_sparsity/model_sparsity": 0.9049119961876754, + "compression_loss": 160.2323455810547, + "distillation_loss": 6.07520866394043, + "epoch": 1.92, + "learning_rate": 4.041420118343196e-05, + "loss": 166.2371, + "step": 2268, + "task_loss": 2.0306954383850098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4991231168281116, + "compression/movement_sparsity/importance_threshold": -2.7534390255427865e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9371220158101108, + "compression/movement_sparsity/model_sparsity": 0.9049289916425081, + "compression_loss": 160.23385620117188, + "distillation_loss": 5.878981590270996, + "epoch": 1.92, + "learning_rate": 4.040997464074388e-05, + "loss": 166.614, + "step": 2269, + "task_loss": 2.6395576000213623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4991494215429304, + "compression/movement_sparsity/importance_threshold": -2.670841445088709e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9371451129228214, + "compression/movement_sparsity/model_sparsity": 0.9049512952983422, + "compression_loss": 160.2352752685547, + "distillation_loss": 6.553890228271484, + "epoch": 1.92, + "learning_rate": 4.040574809805579e-05, + "loss": 165.7844, + "step": 2270, + "task_loss": 3.7949657440185547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4991751948676264, + "compression/movement_sparsity/importance_threshold": -2.5899124452962927e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.937151385034998, + "compression/movement_sparsity/model_sparsity": 0.90495735194417, + "compression_loss": 160.23672485351562, + "distillation_loss": 6.890753269195557, + "epoch": 1.92, + "learning_rate": 4.040152155536771e-05, + "loss": 165.6783, + "step": 2271, + "task_loss": 2.9423232078552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992004422245473, + "compression/movement_sparsity/importance_threshold": -2.510634999832069e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9372227511782986, + "compression/movement_sparsity/model_sparsity": 0.9050262664409005, + "compression_loss": 160.2380828857422, + "distillation_loss": 8.312797546386719, + "epoch": 1.92, + "learning_rate": 4.039729501267963e-05, + "loss": 166.6976, + "step": 2272, + "task_loss": 3.2233598232269287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992251690360416, + "compression/movement_sparsity/importance_threshold": -2.43299208236257e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9372347945876107, + "compression/movement_sparsity/model_sparsity": 0.9050378961220529, + "compression_loss": 160.23939514160156, + "distillation_loss": 8.048696517944336, + "epoch": 1.92, + "learning_rate": 4.039306846999155e-05, + "loss": 166.8033, + "step": 2273, + "task_loss": 3.3677070140838623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992493807244573, + "compression/movement_sparsity/importance_threshold": -2.3569666665543276e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9372657497267934, + "compression/movement_sparsity/model_sparsity": 0.9050677878569755, + "compression_loss": 160.24057006835938, + "distillation_loss": 7.093265533447266, + "epoch": 1.92, + "learning_rate": 4.038884192730347e-05, + "loss": 167.0771, + "step": 2274, + "task_loss": 3.1502764225006104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992730827121428, + "compression/movement_sparsity/importance_threshold": -2.2825417260738737e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9373358042116542, + "compression/movement_sparsity/model_sparsity": 0.9051354357547684, + "compression_loss": 160.2418212890625, + "distillation_loss": 7.007755279541016, + "epoch": 1.92, + "learning_rate": 4.038461538461539e-05, + "loss": 166.8328, + "step": 2275, + "task_loss": 3.3897366523742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992962804214458, + "compression/movement_sparsity/importance_threshold": -2.2097002345894748e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.937413907509669, + "compression/movement_sparsity/model_sparsity": 0.9052108559642227, + "compression_loss": 160.2429962158203, + "distillation_loss": 6.155128002166748, + "epoch": 1.92, + "learning_rate": 4.038038884192731e-05, + "loss": 166.252, + "step": 2276, + "task_loss": 3.0687942504882812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993189792747152, + "compression/movement_sparsity/importance_threshold": -2.1384251657650608e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9375080249648189, + "compression/movement_sparsity/model_sparsity": 0.905301740195249, + "compression_loss": 160.24404907226562, + "distillation_loss": 6.239344596862793, + "epoch": 1.92, + "learning_rate": 4.037616229923922e-05, + "loss": 166.59, + "step": 2277, + "task_loss": 2.195801258087158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993411846942988, + "compression/movement_sparsity/importance_threshold": -2.0686994932697655e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9375764100662106, + "compression/movement_sparsity/model_sparsity": 0.9053677760580306, + "compression_loss": 160.24514770507812, + "distillation_loss": 6.321161270141602, + "epoch": 1.93, + "learning_rate": 4.037193575655114e-05, + "loss": 166.8741, + "step": 2278, + "task_loss": 2.8800103664398193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993629021025447, + "compression/movement_sparsity/importance_threshold": -2.0005061907692537e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9376463333852273, + "compression/movement_sparsity/model_sparsity": 0.9054352972959299, + "compression_loss": 160.2462158203125, + "distillation_loss": 6.892311096191406, + "epoch": 1.93, + "learning_rate": 4.036770921386307e-05, + "loss": 166.2419, + "step": 2279, + "task_loss": 3.235488176345825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993841369218013, + "compression/movement_sparsity/importance_threshold": -1.9338282319309244e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.937731841591344, + "compression/movement_sparsity/model_sparsity": 0.9055178680321125, + "compression_loss": 160.2470245361328, + "distillation_loss": 6.462249755859375, + "epoch": 1.93, + "learning_rate": 4.036348267117498e-05, + "loss": 167.0156, + "step": 2280, + "task_loss": 2.772505760192871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994048945744167, + "compression/movement_sparsity/importance_threshold": -1.8686485904213096e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9378695538033707, + "compression/movement_sparsity/model_sparsity": 0.9056508494060037, + "compression_loss": 160.2480010986328, + "distillation_loss": 5.56057596206665, + "epoch": 1.93, + "learning_rate": 4.03592561284869e-05, + "loss": 166.0986, + "step": 2281, + "task_loss": 3.266045331954956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994251804827392, + "compression/movement_sparsity/importance_threshold": -1.8049502399060738e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9378897414191781, + "compression/movement_sparsity/model_sparsity": 0.9056703435151038, + "compression_loss": 160.24879455566406, + "distillation_loss": 6.473270893096924, + "epoch": 1.93, + "learning_rate": 4.035502958579882e-05, + "loss": 166.6945, + "step": 2282, + "task_loss": 2.9187378883361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994450000691169, + "compression/movement_sparsity/importance_threshold": -1.7427161540526162e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9379320960626207, + "compression/movement_sparsity/model_sparsity": 0.9057112431462461, + "compression_loss": 160.2496337890625, + "distillation_loss": 6.201051712036133, + "epoch": 1.93, + "learning_rate": 4.035080304311073e-05, + "loss": 166.3391, + "step": 2283, + "task_loss": 3.883798837661743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499464358755898, + "compression/movement_sparsity/importance_threshold": -1.6819293065283361e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9379536788060416, + "compression/movement_sparsity/model_sparsity": 0.9057320844560341, + "compression_loss": 160.25018310546875, + "distillation_loss": 6.144804954528809, + "epoch": 1.93, + "learning_rate": 4.034657650042266e-05, + "loss": 166.4444, + "step": 2284, + "task_loss": 2.8304660320281982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994832619654308, + "compression/movement_sparsity/importance_threshold": -1.622572670998898e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9380174373303907, + "compression/movement_sparsity/model_sparsity": 0.9057936526789275, + "compression_loss": 160.25088500976562, + "distillation_loss": 6.457509517669678, + "epoch": 1.93, + "learning_rate": 4.034234995773458e-05, + "loss": 166.3994, + "step": 2285, + "task_loss": 3.257936716079712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995017151200631, + "compression/movement_sparsity/importance_threshold": -1.5646292211325685e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9381504633445363, + "compression/movement_sparsity/model_sparsity": 0.9059221088402515, + "compression_loss": 160.2515106201172, + "distillation_loss": 7.1787238121032715, + "epoch": 1.93, + "learning_rate": 4.033812341504649e-05, + "loss": 167.0358, + "step": 2286, + "task_loss": 3.381146192550659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995197236421438, + "compression/movement_sparsity/importance_threshold": -1.5080819305941448e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9381933664996902, + "compression/movement_sparsity/model_sparsity": 0.9059635381400403, + "compression_loss": 160.25198364257812, + "distillation_loss": 7.339473724365234, + "epoch": 1.93, + "learning_rate": 4.033389687235841e-05, + "loss": 166.7057, + "step": 2287, + "task_loss": 3.6668620109558105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995372929540205, + "compression/movement_sparsity/importance_threshold": -1.452913773051026e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9383022937710438, + "compression/movement_sparsity/model_sparsity": 0.9060687234245234, + "compression_loss": 160.25244140625, + "distillation_loss": 6.810704708099365, + "epoch": 1.93, + "learning_rate": 4.032967032967033e-05, + "loss": 166.5957, + "step": 2288, + "task_loss": 2.7721328735351562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995544284780418, + "compression/movement_sparsity/importance_threshold": -1.3991077221706116e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9383476652288982, + "compression/movement_sparsity/model_sparsity": 0.9061125362332216, + "compression_loss": 160.25282287597656, + "distillation_loss": 6.215928077697754, + "epoch": 1.93, + "learning_rate": 4.032544378698225e-05, + "loss": 166.6523, + "step": 2289, + "task_loss": 2.639528512954712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995711356365553, + "compression/movement_sparsity/importance_threshold": -1.3466467516194333e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9383736479901768, + "compression/movement_sparsity/model_sparsity": 0.9061376264067179, + "compression_loss": 160.25323486328125, + "distillation_loss": 6.250784873962402, + "epoch": 1.94, + "learning_rate": 4.032121724429417e-05, + "loss": 166.8524, + "step": 2290, + "task_loss": 2.1177682876586914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995874198519097, + "compression/movement_sparsity/importance_threshold": -1.2955138350631556e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9384423669682622, + "compression/movement_sparsity/model_sparsity": 0.9062039846765019, + "compression_loss": 160.2536163330078, + "distillation_loss": 5.532809257507324, + "epoch": 1.94, + "learning_rate": 4.031699070160609e-05, + "loss": 165.9842, + "step": 2291, + "task_loss": 4.4165496826171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996032865464533, + "compression/movement_sparsity/importance_threshold": -1.2456919461691779e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9385329906422948, + "compression/movement_sparsity/model_sparsity": 0.9062914951485403, + "compression_loss": 160.25393676757812, + "distillation_loss": 6.652081489562988, + "epoch": 1.94, + "learning_rate": 4.031276415891801e-05, + "loss": 166.7904, + "step": 2292, + "task_loss": 4.348559379577637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499618741142534, + "compression/movement_sparsity/importance_threshold": -1.1971640586057666e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9386001356302524, + "compression/movement_sparsity/model_sparsity": 0.9063563334995994, + "compression_loss": 160.25424194335938, + "distillation_loss": 6.511976718902588, + "epoch": 1.94, + "learning_rate": 4.030853761622992e-05, + "loss": 166.1173, + "step": 2293, + "task_loss": 3.1712534427642822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996337890625, + "compression/movement_sparsity/importance_threshold": -1.1499131460368517e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9385707783295328, + "compression/movement_sparsity/model_sparsity": 0.9063279847124732, + "compression_loss": 160.25454711914062, + "distillation_loss": 7.2438225746154785, + "epoch": 1.94, + "learning_rate": 4.030431107354184e-05, + "loss": 166.7937, + "step": 2294, + "task_loss": 3.1915442943573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996484357286994, + "compression/movement_sparsity/importance_threshold": -1.1039221821306996e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9386646096046595, + "compression/movement_sparsity/model_sparsity": 0.9064185925946405, + "compression_loss": 160.2546844482422, + "distillation_loss": 5.350149154663086, + "epoch": 1.94, + "learning_rate": 4.030008453085376e-05, + "loss": 166.2937, + "step": 2295, + "task_loss": 2.411289691925049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499662686563481, + "compression/movement_sparsity/importance_threshold": -1.0591741405538424e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9386838552112238, + "compression/movement_sparsity/model_sparsity": 0.9064371770554128, + "compression_loss": 160.25479125976562, + "distillation_loss": 4.8888044357299805, + "epoch": 1.94, + "learning_rate": 4.029585798816568e-05, + "loss": 165.8791, + "step": 2296, + "task_loss": 3.549776554107666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996765469891922, + "compression/movement_sparsity/importance_threshold": -1.0156519949736792e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9387504636116377, + "compression/movement_sparsity/model_sparsity": 0.9065014972523612, + "compression_loss": 160.25489807128906, + "distillation_loss": 5.807749271392822, + "epoch": 1.94, + "learning_rate": 4.02916314454776e-05, + "loss": 166.0598, + "step": 2297, + "task_loss": 3.165010452270508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996900224281817, + "compression/movement_sparsity/importance_threshold": -9.733387190558745e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9388193972247406, + "compression/movement_sparsity/model_sparsity": 0.9065680627837894, + "compression_loss": 160.2548828125, + "distillation_loss": 7.553060531616211, + "epoch": 1.94, + "learning_rate": 4.028740490278952e-05, + "loss": 167.4046, + "step": 2298, + "task_loss": 4.012598514556885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997031183027976, + "compression/movement_sparsity/importance_threshold": -9.322172864669603e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9388033353709351, + "compression/movement_sparsity/model_sparsity": 0.9065525527040742, + "compression_loss": 160.2548065185547, + "distillation_loss": 5.544658184051514, + "epoch": 1.94, + "learning_rate": 4.0283178360101435e-05, + "loss": 166.6705, + "step": 2299, + "task_loss": 2.5536773204803467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997158400353878, + "compression/movement_sparsity/importance_threshold": -8.922706708752032e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9388191706655555, + "compression/movement_sparsity/model_sparsity": 0.9065678440076093, + "compression_loss": 160.25482177734375, + "distillation_loss": 6.991927623748779, + "epoch": 1.94, + "learning_rate": 4.0278951817413355e-05, + "loss": 167.0597, + "step": 2300, + "task_loss": 3.9764349460601807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499728193048301, + "compression/movement_sparsity/importance_threshold": -8.534818459454002e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9389276090460361, + "compression/movement_sparsity/model_sparsity": 0.9066725571961248, + "compression_loss": 160.25469970703125, + "distillation_loss": 9.4048433303833, + "epoch": 1.94, + "learning_rate": 4.027472527472528e-05, + "loss": 167.3636, + "step": 2301, + "task_loss": 3.833726406097412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499740182763885, + "compression/movement_sparsity/importance_threshold": -8.158337853458181e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9389631549897586, + "compression/movement_sparsity/model_sparsity": 0.9067068820273284, + "compression_loss": 160.2545928955078, + "distillation_loss": 5.585280895233154, + "epoch": 1.95, + "learning_rate": 4.0270498732037194e-05, + "loss": 165.9951, + "step": 2302, + "task_loss": 2.1264007091522217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997518146044881, + "compression/movement_sparsity/importance_threshold": -7.793094627421213e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9390370013599275, + "compression/movement_sparsity/model_sparsity": 0.9067781915475039, + "compression_loss": 160.25433349609375, + "distillation_loss": 5.731496810913086, + "epoch": 1.95, + "learning_rate": 4.0266272189349114e-05, + "loss": 166.0074, + "step": 2303, + "task_loss": 3.065538167953491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997630939924584, + "compression/movement_sparsity/importance_threshold": -7.438918518025764e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.939011376323678, + "compression/movement_sparsity/model_sparsity": 0.9067534468100815, + "compression_loss": 160.25421142578125, + "distillation_loss": 4.953388690948486, + "epoch": 1.95, + "learning_rate": 4.0262045646661034e-05, + "loss": 166.1986, + "step": 2304, + "task_loss": 3.1932075023651123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997740263501445, + "compression/movement_sparsity/importance_threshold": -7.095639261919806e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9389827583213519, + "compression/movement_sparsity/model_sparsity": 0.9067258119241746, + "compression_loss": 160.25387573242188, + "distillation_loss": 5.227053642272949, + "epoch": 1.95, + "learning_rate": 4.0257819103972953e-05, + "loss": 165.0266, + "step": 2305, + "task_loss": 2.610239028930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499784617099894, + "compression/movement_sparsity/importance_threshold": -6.763086595777332e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9390099334993941, + "compression/movement_sparsity/model_sparsity": 0.9067520535512503, + "compression_loss": 160.2535858154297, + "distillation_loss": 8.27192497253418, + "epoch": 1.95, + "learning_rate": 4.0253592561284866e-05, + "loss": 166.5678, + "step": 2306, + "task_loss": 3.0572874546051025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997948716640557, + "compression/movement_sparsity/importance_threshold": -6.44109025626366e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9390175768908486, + "compression/movement_sparsity/model_sparsity": 0.9067594343686947, + "compression_loss": 160.25323486328125, + "distillation_loss": 6.735799789428711, + "epoch": 1.95, + "learning_rate": 4.024936601859679e-05, + "loss": 166.4424, + "step": 2307, + "task_loss": 3.5349838733673096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998047954649776, + "compression/movement_sparsity/importance_threshold": -6.129479980052782e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9390480669874935, + "compression/movement_sparsity/model_sparsity": 0.9067888770367213, + "compression_loss": 160.2529296875, + "distillation_loss": 7.471296310424805, + "epoch": 1.95, + "learning_rate": 4.024513947590871e-05, + "loss": 166.3025, + "step": 2308, + "task_loss": 3.3933815956115723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998143939250075, + "compression/movement_sparsity/importance_threshold": -5.828085503818692e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9391163566955442, + "compression/movement_sparsity/model_sparsity": 0.9068548207832167, + "compression_loss": 160.25267028808594, + "distillation_loss": 7.8446125984191895, + "epoch": 1.95, + "learning_rate": 4.0240912933220626e-05, + "loss": 166.5962, + "step": 2309, + "task_loss": 3.4806113243103027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499823672466494, + "compression/movement_sparsity/importance_threshold": -5.53673656420936e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9391452131812229, + "compression/movement_sparsity/model_sparsity": 0.9068826859598395, + "compression_loss": 160.252197265625, + "distillation_loss": 5.635642051696777, + "epoch": 1.95, + "learning_rate": 4.0236686390532545e-05, + "loss": 166.0683, + "step": 2310, + "task_loss": 3.0456361770629883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499832636511785, + "compression/movement_sparsity/importance_threshold": -5.255262897907453e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9392000047315097, + "compression/movement_sparsity/model_sparsity": 0.9069355952518153, + "compression_loss": 160.2517852783203, + "distillation_loss": 8.03728199005127, + "epoch": 1.95, + "learning_rate": 4.0232459847844465e-05, + "loss": 166.6682, + "step": 2311, + "task_loss": 3.352954626083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499841291483229, + "compression/movement_sparsity/importance_threshold": -4.983494241569617e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9392522325857547, + "compression/movement_sparsity/model_sparsity": 0.9069860289185955, + "compression_loss": 160.2512664794922, + "distillation_loss": 8.243897438049316, + "epoch": 1.95, + "learning_rate": 4.0228233305156385e-05, + "loss": 166.4311, + "step": 2312, + "task_loss": 4.332007884979248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499849642803174, + "compression/movement_sparsity/importance_threshold": -4.7212603318611684e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9393562113275395, + "compression/movement_sparsity/model_sparsity": 0.9070864356707239, + "compression_loss": 160.2508087158203, + "distillation_loss": 6.26651668548584, + "epoch": 1.95, + "learning_rate": 4.0224006762468305e-05, + "loss": 166.9993, + "step": 2313, + "task_loss": 3.6233510971069336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998576958939684, + "compression/movement_sparsity/importance_threshold": -4.468390905464775e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9394542637580092, + "compression/movement_sparsity/model_sparsity": 0.9071811196985623, + "compression_loss": 160.2501983642578, + "distillation_loss": 6.953649044036865, + "epoch": 1.96, + "learning_rate": 4.0219780219780224e-05, + "loss": 166.9118, + "step": 2314, + "task_loss": 3.4671061038970947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998654561779603, + "compression/movement_sparsity/importance_threshold": -4.224715699037082e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.939546616436349, + "compression/movement_sparsity/model_sparsity": 0.9072702997782911, + "compression_loss": 160.24966430664062, + "distillation_loss": 7.9676690101623535, + "epoch": 1.96, + "learning_rate": 4.021555367709214e-05, + "loss": 166.4533, + "step": 2315, + "task_loss": 3.0737318992614746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499872929077498, + "compression/movement_sparsity/importance_threshold": -3.990064449243408e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9395063008255722, + "compression/movement_sparsity/model_sparsity": 0.9072313691327697, + "compression_loss": 160.2492218017578, + "distillation_loss": 6.623746871948242, + "epoch": 1.96, + "learning_rate": 4.021132713440406e-05, + "loss": 166.2768, + "step": 2316, + "task_loss": 3.9193062782287598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998801200149292, + "compression/movement_sparsity/importance_threshold": -3.764266892757745e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9395257372188186, + "compression/movement_sparsity/model_sparsity": 0.9072501378261149, + "compression_loss": 160.24867248535156, + "distillation_loss": 5.222832202911377, + "epoch": 1.96, + "learning_rate": 4.020710059171598e-05, + "loss": 166.5427, + "step": 2317, + "task_loss": 2.873854875564575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998870344126027, + "compression/movement_sparsity/importance_threshold": -3.5471527662367386e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.939577070760491, + "compression/movement_sparsity/model_sparsity": 0.9072997079027103, + "compression_loss": 160.24815368652344, + "distillation_loss": 7.107813835144043, + "epoch": 1.96, + "learning_rate": 4.0202874049027896e-05, + "loss": 166.4333, + "step": 2318, + "task_loss": 3.1081695556640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998936776928664, + "compression/movement_sparsity/importance_threshold": -3.338551806363055e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9396959785601559, + "compression/movement_sparsity/model_sparsity": 0.9074145308536535, + "compression_loss": 160.24745178222656, + "distillation_loss": 6.21928596496582, + "epoch": 1.96, + "learning_rate": 4.0198647506339816e-05, + "loss": 166.0708, + "step": 2319, + "task_loss": 2.2859959602355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999000552780686, + "compression/movement_sparsity/importance_threshold": -3.138293749793339e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.939757209160966, + "compression/movement_sparsity/model_sparsity": 0.9074736579949585, + "compression_loss": 160.2467803955078, + "distillation_loss": 7.165808200836182, + "epoch": 1.96, + "learning_rate": 4.0194420963651736e-05, + "loss": 167.3753, + "step": 2320, + "task_loss": 4.1401214599609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999061725905571, + "compression/movement_sparsity/importance_threshold": -2.9462083331929095e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9397843724148405, + "compression/movement_sparsity/model_sparsity": 0.9074998881074984, + "compression_loss": 160.2461700439453, + "distillation_loss": 6.096712112426758, + "epoch": 1.96, + "learning_rate": 4.0190194420963656e-05, + "loss": 166.5108, + "step": 2321, + "task_loss": 3.754952907562256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499912035052681, + "compression/movement_sparsity/importance_threshold": -2.7621252932357593e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9397642086473682, + "compression/movement_sparsity/model_sparsity": 0.9074804170274698, + "compression_loss": 160.2455291748047, + "distillation_loss": 6.692794322967529, + "epoch": 1.96, + "learning_rate": 4.018596787827557e-05, + "loss": 166.4531, + "step": 2322, + "task_loss": 3.667102098464966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999176480867873, + "compression/movement_sparsity/importance_threshold": -2.585874366587207e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.939793196298891, + "compression/movement_sparsity/model_sparsity": 0.9075084088639864, + "compression_loss": 160.24473571777344, + "distillation_loss": 6.191159725189209, + "epoch": 1.96, + "learning_rate": 4.018174133558749e-05, + "loss": 166.1407, + "step": 2323, + "task_loss": 2.7325778007507324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999230171152251, + "compression/movement_sparsity/importance_threshold": -2.4172852899125713e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9398451022006099, + "compression/movement_sparsity/model_sparsity": 0.9075585316383, + "compression_loss": 160.2440185546875, + "distillation_loss": 7.136834621429443, + "epoch": 1.96, + "learning_rate": 4.0177514792899415e-05, + "loss": 166.5416, + "step": 2324, + "task_loss": 3.2274491786956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999281475603423, + "compression/movement_sparsity/importance_threshold": -2.2561877998858448e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9399011219401632, + "compression/movement_sparsity/model_sparsity": 0.9076126269274628, + "compression_loss": 160.2433624267578, + "distillation_loss": 6.396122455596924, + "epoch": 1.96, + "learning_rate": 4.017328825021133e-05, + "loss": 167.1078, + "step": 2325, + "task_loss": 3.020205497741699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999330448444872, + "compression/movement_sparsity/importance_threshold": -2.1024116331636727e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9399558061729413, + "compression/movement_sparsity/model_sparsity": 0.9076654325886165, + "compression_loss": 160.2424774169922, + "distillation_loss": 6.5041184425354, + "epoch": 1.97, + "learning_rate": 4.016906170752325e-05, + "loss": 166.9327, + "step": 2326, + "task_loss": 3.6526072025299072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499937714390008, + "compression/movement_sparsity/importance_threshold": -1.9557865264200475e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9400235712176158, + "compression/movement_sparsity/model_sparsity": 0.907730869695537, + "compression_loss": 160.24172973632812, + "distillation_loss": 6.3362016677856445, + "epoch": 1.97, + "learning_rate": 4.016483516483517e-05, + "loss": 166.1995, + "step": 2327, + "task_loss": 3.876760721206665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999421616192525, + "compression/movement_sparsity/importance_threshold": -1.8161422163116142e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9400382975646462, + "compression/movement_sparsity/model_sparsity": 0.9077450901472431, + "compression_loss": 160.24099731445312, + "distillation_loss": 5.346047878265381, + "epoch": 1.97, + "learning_rate": 4.016060862214708e-05, + "loss": 165.7456, + "step": 2328, + "task_loss": 1.9838918447494507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999463919545695, + "compression/movement_sparsity/importance_threshold": -1.683308439521039e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9401107845797045, + "compression/movement_sparsity/model_sparsity": 0.9078150870103382, + "compression_loss": 160.2401580810547, + "distillation_loss": 5.978574752807617, + "epoch": 1.97, + "learning_rate": 4.015638207945901e-05, + "loss": 166.1061, + "step": 2329, + "task_loss": 3.0958638191223145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999504108183066, + "compression/movement_sparsity/importance_threshold": -1.5571149327136408e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9401909030620499, + "compression/movement_sparsity/model_sparsity": 0.9078924531763417, + "compression_loss": 160.2393035888672, + "distillation_loss": 6.485856056213379, + "epoch": 1.97, + "learning_rate": 4.0152155536770927e-05, + "loss": 167.1001, + "step": 2330, + "task_loss": 3.1669461727142334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999542236328125, + "compression/movement_sparsity/importance_threshold": -1.4373914325460646e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9402461000340363, + "compression/movement_sparsity/model_sparsity": 0.9079457539625346, + "compression_loss": 160.23838806152344, + "distillation_loss": 6.434739112854004, + "epoch": 1.97, + "learning_rate": 4.014792899408284e-05, + "loss": 166.3486, + "step": 2331, + "task_loss": 3.4682846069335938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999578358204353, + "compression/movement_sparsity/importance_threshold": -1.323967675692303e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9402534095487971, + "compression/movement_sparsity/model_sparsity": 0.9079528123729766, + "compression_loss": 160.23739624023438, + "distillation_loss": 5.510326862335205, + "epoch": 1.97, + "learning_rate": 4.014370245139476e-05, + "loss": 165.9706, + "step": 2332, + "task_loss": 2.847395420074463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999612528035227, + "compression/movement_sparsity/importance_threshold": -1.2166733988176748e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9402802150776426, + "compression/movement_sparsity/model_sparsity": 0.9079786970494428, + "compression_loss": 160.23663330078125, + "distillation_loss": 5.170238494873047, + "epoch": 1.97, + "learning_rate": 4.013947590870668e-05, + "loss": 166.3467, + "step": 2333, + "task_loss": 3.0937933921813965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999644800044236, + "compression/movement_sparsity/importance_threshold": -1.1153383385961724e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9403256342321675, + "compression/movement_sparsity/model_sparsity": 0.9080225559162842, + "compression_loss": 160.23556518554688, + "distillation_loss": 7.272273063659668, + "epoch": 1.97, + "learning_rate": 4.01352493660186e-05, + "loss": 166.3301, + "step": 2334, + "task_loss": 3.949800491333008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999675228454856, + "compression/movement_sparsity/importance_threshold": -1.019792231684441e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9403562077979859, + "compression/movement_sparsity/model_sparsity": 0.9080520791860615, + "compression_loss": 160.23471069335938, + "distillation_loss": 4.398719310760498, + "epoch": 1.97, + "learning_rate": 4.013102282333052e-05, + "loss": 165.8517, + "step": 2335, + "task_loss": 2.015611171722412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999703867490572, + "compression/movement_sparsity/importance_threshold": -9.298648147564731e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.940412132144198, + "compression/movement_sparsity/model_sparsity": 0.9081060823589379, + "compression_loss": 160.23365783691406, + "distillation_loss": 6.767613410949707, + "epoch": 1.97, + "learning_rate": 4.012679628064244e-05, + "loss": 166.5842, + "step": 2336, + "task_loss": 3.376575469970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999730771374868, + "compression/movement_sparsity/importance_threshold": -8.453858244689139e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9404556672802367, + "compression/movement_sparsity/model_sparsity": 0.9081481219291236, + "compression_loss": 160.23275756835938, + "distillation_loss": 6.922449111938477, + "epoch": 1.97, + "learning_rate": 4.012256973795436e-05, + "loss": 166.3696, + "step": 2337, + "task_loss": 2.766040802001953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499975599433122, + "compression/movement_sparsity/importance_threshold": -7.661849975044294e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9405130582990681, + "compression/movement_sparsity/model_sparsity": 0.9082035413899028, + "compression_loss": 160.23179626464844, + "distillation_loss": 5.656912803649902, + "epoch": 1.98, + "learning_rate": 4.011834319526627e-05, + "loss": 166.3511, + "step": 2338, + "task_loss": 4.997627258300781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999779590583118, + "compression/movement_sparsity/importance_threshold": -6.920920705283384e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.940491415934809, + "compression/movement_sparsity/model_sparsity": 0.9081826425074357, + "compression_loss": 160.23074340820312, + "distillation_loss": 5.92845344543457, + "epoch": 1.98, + "learning_rate": 4.011411665257819e-05, + "loss": 165.1377, + "step": 2339, + "task_loss": 2.1684250831604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999801614354036, + "compression/movement_sparsity/importance_threshold": -6.229367801972863e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.940503530889127, + "compression/movement_sparsity/model_sparsity": 0.908194341275803, + "compression_loss": 160.2298583984375, + "distillation_loss": 7.2031168937683105, + "epoch": 1.98, + "learning_rate": 4.010989010989011e-05, + "loss": 166.8086, + "step": 2340, + "task_loss": 3.2679431438446045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499982211986746, + "compression/movement_sparsity/importance_threshold": -5.585488631765917e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9404995243688014, + "compression/movement_sparsity/model_sparsity": 0.908190472391776, + "compression_loss": 160.22882080078125, + "distillation_loss": 7.116390228271484, + "epoch": 1.98, + "learning_rate": 4.010566356720203e-05, + "loss": 166.1407, + "step": 2341, + "task_loss": 3.019827127456665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999841161346872, + "compression/movement_sparsity/importance_threshold": -4.987580561575944e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9405863800058609, + "compression/movement_sparsity/model_sparsity": 0.9082743442705035, + "compression_loss": 160.22769165039062, + "distillation_loss": 7.2136454582214355, + "epoch": 1.98, + "learning_rate": 4.010143702451395e-05, + "loss": 165.974, + "step": 2342, + "task_loss": 3.615882158279419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999858793015752, + "compression/movement_sparsity/importance_threshold": -4.433940957795923e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9406937929079249, + "compression/movement_sparsity/model_sparsity": 0.9083780672089407, + "compression_loss": 160.22671508789062, + "distillation_loss": 7.135324954986572, + "epoch": 1.98, + "learning_rate": 4.009721048182587e-05, + "loss": 167.1272, + "step": 2343, + "task_loss": 3.465183973312378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999875069097586, + "compression/movement_sparsity/importance_threshold": -3.9228671872525156e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9407205030434291, + "compression/movement_sparsity/model_sparsity": 0.9084038597691204, + "compression_loss": 160.2256317138672, + "distillation_loss": 6.006848335266113, + "epoch": 1.98, + "learning_rate": 4.009298393913778e-05, + "loss": 165.7744, + "step": 2344, + "task_loss": 2.4421539306640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999890043815851, + "compression/movement_sparsity/importance_threshold": -3.452656616598909e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9407593996782574, + "compression/movement_sparsity/model_sparsity": 0.9084414201848823, + "compression_loss": 160.2245635986328, + "distillation_loss": 6.364957809448242, + "epoch": 1.98, + "learning_rate": 4.00887573964497e-05, + "loss": 166.7501, + "step": 2345, + "task_loss": 3.831197500228882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999903771394032, + "compression/movement_sparsity/importance_threshold": -3.021606612401556e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9407746864611665, + "compression/movement_sparsity/model_sparsity": 0.9084561818197708, + "compression_loss": 160.22341918945312, + "distillation_loss": 4.808963298797607, + "epoch": 1.98, + "learning_rate": 4.008453085376163e-05, + "loss": 165.3034, + "step": 2346, + "task_loss": 2.1402463912963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999916306055607, + "compression/movement_sparsity/importance_threshold": -2.628014541487117e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9408673014711942, + "compression/movement_sparsity/model_sparsity": 0.908545615219287, + "compression_loss": 160.22210693359375, + "distillation_loss": 5.781253814697266, + "epoch": 1.98, + "learning_rate": 4.008030431107354e-05, + "loss": 165.6374, + "step": 2347, + "task_loss": 3.0369365215301514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999927702024065, + "compression/movement_sparsity/importance_threshold": -2.2701777704220438e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9408660255852572, + "compression/movement_sparsity/model_sparsity": 0.908544383163957, + "compression_loss": 160.22091674804688, + "distillation_loss": 5.894574165344238, + "epoch": 1.98, + "learning_rate": 4.007607776838546e-05, + "loss": 166.107, + "step": 2348, + "task_loss": 3.9779584407806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999938013522884, + "compression/movement_sparsity/importance_threshold": -1.946393665859525e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9409916228429658, + "compression/movement_sparsity/model_sparsity": 0.9086656657694809, + "compression_loss": 160.2196807861328, + "distillation_loss": 8.923492431640625, + "epoch": 1.99, + "learning_rate": 4.007185122569738e-05, + "loss": 166.1568, + "step": 2349, + "task_loss": 3.444336414337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999947294775544, + "compression/movement_sparsity/importance_threshold": -1.6549595946262208e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9410088532651996, + "compression/movement_sparsity/model_sparsity": 0.908682304273704, + "compression_loss": 160.21849060058594, + "distillation_loss": 5.9663190841674805, + "epoch": 1.99, + "learning_rate": 4.00676246830093e-05, + "loss": 166.3487, + "step": 2350, + "task_loss": 3.7416346073150635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999955600005528, + "compression/movement_sparsity/importance_threshold": -1.3941729232885836e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.941048167245895, + "compression/movement_sparsity/model_sparsity": 0.9087202676982186, + "compression_loss": 160.21734619140625, + "distillation_loss": 6.85717248916626, + "epoch": 1.99, + "learning_rate": 4.006339814032122e-05, + "loss": 167.0489, + "step": 2351, + "task_loss": 3.1543943881988525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999962983436324, + "compression/movement_sparsity/importance_threshold": -1.1623310184130653e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.941123682999533, + "compression/movement_sparsity/model_sparsity": 0.9087931892534055, + "compression_loss": 160.21609497070312, + "distillation_loss": 5.350762367248535, + "epoch": 1.99, + "learning_rate": 4.005917159763314e-05, + "loss": 165.9253, + "step": 2352, + "task_loss": 3.19876766204834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999969499291403, + "compression/movement_sparsity/importance_threshold": -9.577312469130628e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9411966112087939, + "compression/movement_sparsity/model_sparsity": 0.9088636121543249, + "compression_loss": 160.21475219726562, + "distillation_loss": 7.298005104064941, + "epoch": 1.99, + "learning_rate": 4.005494505494506e-05, + "loss": 166.4483, + "step": 2353, + "task_loss": 3.664341688156128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999975201794253, + "compression/movement_sparsity/importance_threshold": -7.786709752682919e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9412855416510222, + "compression/movement_sparsity/model_sparsity": 0.9089494875622806, + "compression_loss": 160.21347045898438, + "distillation_loss": 7.00773286819458, + "epoch": 1.99, + "learning_rate": 4.005071851225697e-05, + "loss": 166.6199, + "step": 2354, + "task_loss": 2.653808116912842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499998014516836, + "compression/movement_sparsity/importance_threshold": -6.23447570218677e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9413031417224527, + "compression/movement_sparsity/model_sparsity": 0.9089664830171132, + "compression_loss": 160.21218872070312, + "distillation_loss": 6.630448818206787, + "epoch": 1.99, + "learning_rate": 4.004649196956889e-05, + "loss": 166.6982, + "step": 2355, + "task_loss": 3.0347328186035156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.49999843836372, + "compression/movement_sparsity/importance_threshold": -4.903583984174065e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.941277588231209, + "compression/movement_sparsity/model_sparsity": 0.9089418073669056, + "compression_loss": 160.21099853515625, + "distillation_loss": 6.061424732208252, + "epoch": 1.99, + "learning_rate": 4.004226542688081e-05, + "loss": 166.0519, + "step": 2356, + "task_loss": 2.741102933883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999987971424256, + "compression/movement_sparsity/importance_threshold": -3.7770082651766845e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9412779101837352, + "compression/movement_sparsity/model_sparsity": 0.908942118259372, + "compression_loss": 160.2096710205078, + "distillation_loss": 7.33541202545166, + "epoch": 1.99, + "learning_rate": 4.003803888419273e-05, + "loss": 166.7486, + "step": 2357, + "task_loss": 3.0860352516174316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999990962753007, + "compression/movement_sparsity/importance_threshold": -2.8377222134612357e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9413075178919751, + "compression/movement_sparsity/model_sparsity": 0.9089707088517499, + "compression_loss": 160.2085418701172, + "distillation_loss": 6.780506610870361, + "epoch": 1.99, + "learning_rate": 4.003381234150465e-05, + "loss": 166.1792, + "step": 2358, + "task_loss": 3.4217426776885986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999993411846944, + "compression/movement_sparsity/importance_threshold": -2.0686994929575153e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9413916667429814, + "compression/movement_sparsity/model_sparsity": 0.909051966930852, + "compression_loss": 160.20724487304688, + "distillation_loss": 7.029179573059082, + "epoch": 1.99, + "learning_rate": 4.002958579881657e-05, + "loss": 166.5161, + "step": 2359, + "task_loss": 3.1922600269317627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499999537292954, + "compression/movement_sparsity/importance_threshold": -1.4529137727994912e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414161232108025, + "compression/movement_sparsity/model_sparsity": 0.9090755832437666, + "compression_loss": 160.20599365234375, + "distillation_loss": 8.545100212097168, + "epoch": 1.99, + "learning_rate": 4.0025359256128485e-05, + "loss": 166.6691, + "step": 2360, + "task_loss": 3.9525198936462402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999996900224282, + "compression/movement_sparsity/importance_threshold": -9.73338718651684e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414943457504937, + "compression/movement_sparsity/model_sparsity": 0.9091511185985788, + "compression_loss": 160.20477294921875, + "distillation_loss": 5.64668083190918, + "epoch": 2.0, + "learning_rate": 4.0021132713440405e-05, + "loss": 166.3668, + "step": 2361, + "task_loss": 2.5068724155426025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499999804795465, + "compression/movement_sparsity/importance_threshold": -6.129479979133379e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9415185398866269, + "compression/movement_sparsity/model_sparsity": 0.9091744815917059, + "compression_loss": 160.2035369873047, + "distillation_loss": 8.312286376953125, + "epoch": 2.0, + "learning_rate": 4.0016906170752324e-05, + "loss": 167.4499, + "step": 2362, + "task_loss": 3.8813138008117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999998870344127, + "compression/movement_sparsity/importance_threshold": -3.547152762489736e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9415324673144256, + "compression/movement_sparsity/model_sparsity": 0.9091879305695139, + "compression_loss": 160.20236206054688, + "distillation_loss": 5.018465995788574, + "epoch": 2.0, + "learning_rate": 4.001267962806425e-05, + "loss": 166.0319, + "step": 2363, + "task_loss": 2.1185457706451416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999999421616192, + "compression/movement_sparsity/importance_threshold": -1.8161422192519705e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9415084043441364, + "compression/movement_sparsity/model_sparsity": 0.9091646942362805, + "compression_loss": 160.201171875, + "distillation_loss": 5.742884635925293, + "epoch": 2.0, + "learning_rate": 4.0008453085376164e-05, + "loss": 165.8484, + "step": 2364, + "task_loss": 3.038019895553589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499999975599433, + "compression/movement_sparsity/importance_threshold": -7.661849973916723e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414946438546846, + "compression/movement_sparsity/model_sparsity": 0.9091514064619736, + "compression_loss": 160.199951171875, + "distillation_loss": 6.367249488830566, + "epoch": 2.0, + "learning_rate": 4.0004226542688084e-05, + "loss": 166.3348, + "step": 2365, + "task_loss": 2.932345390319824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999999927702024, + "compression/movement_sparsity/importance_threshold": -2.2701777957490066e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9415118027319127, + "compression/movement_sparsity/model_sparsity": 0.9091679758789819, + "compression_loss": 160.19873046875, + "distillation_loss": 7.022960186004639, + "epoch": 2.0, + "learning_rate": 4e-05, + "loss": 165.5297, + "step": 2366, + "task_loss": 2.767848491668701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 6.727951526641846, + "epoch": 2.0, + "learning_rate": 3.9995773457311916e-05, + "loss": 131.5092, + "step": 2367, + "task_loss": 3.372413158416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 5.7696685791015625, + "epoch": 2.0, + "learning_rate": 3.999154691462384e-05, + "loss": 5.3427, + "step": 2368, + "task_loss": 2.0392656326293945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 5.515368938446045, + "epoch": 2.0, + "learning_rate": 3.998732037193576e-05, + "loss": 5.4626, + "step": 2369, + "task_loss": 2.3784356117248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 5.478196144104004, + "epoch": 2.0, + "learning_rate": 3.9983093829247675e-05, + "loss": 4.8103, + "step": 2370, + "task_loss": 2.407582998275757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 4.114841461181641, + "epoch": 2.0, + "learning_rate": 3.9978867286559595e-05, + "loss": 3.4251, + "step": 2371, + "task_loss": 2.1522226333618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 4.007223606109619, + "epoch": 2.01, + "learning_rate": 3.9974640743871515e-05, + "loss": 3.8089, + "step": 2372, + "task_loss": 2.4861936569213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 4.57686710357666, + "epoch": 2.01, + "learning_rate": 3.9970414201183435e-05, + "loss": 3.7541, + "step": 2373, + "task_loss": 1.73525071144104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.09799861907959, + "epoch": 2.01, + "learning_rate": 3.9966187658495354e-05, + "loss": 3.3156, + "step": 2374, + "task_loss": 1.3507612943649292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.0171287059783936, + "epoch": 2.01, + "learning_rate": 3.9961961115807274e-05, + "loss": 3.0909, + "step": 2375, + "task_loss": 1.0064871311187744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.3522822856903076, + "epoch": 2.01, + "learning_rate": 3.995773457311919e-05, + "loss": 2.4917, + "step": 2376, + "task_loss": 1.5581567287445068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.805544853210449, + "epoch": 2.01, + "learning_rate": 3.995350803043111e-05, + "loss": 3.1072, + "step": 2377, + "task_loss": 1.9875231981277466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.0497374534606934, + "epoch": 2.01, + "learning_rate": 3.9949281487743027e-05, + "loss": 2.807, + "step": 2378, + "task_loss": 0.9842453598976135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.2810075283050537, + "epoch": 2.01, + "learning_rate": 3.9945054945054946e-05, + "loss": 3.0397, + "step": 2379, + "task_loss": 1.8561878204345703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.6260640621185303, + "epoch": 2.01, + "learning_rate": 3.9940828402366866e-05, + "loss": 2.7026, + "step": 2380, + "task_loss": 1.3060163259506226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.07041335105896, + "epoch": 2.01, + "learning_rate": 3.9936601859678786e-05, + "loss": 2.7868, + "step": 2381, + "task_loss": 1.4494351148605347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.1373894214630127, + "epoch": 2.01, + "learning_rate": 3.9932375316990706e-05, + "loss": 2.22, + "step": 2382, + "task_loss": 1.4277929067611694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.55580997467041, + "epoch": 2.01, + "learning_rate": 3.992814877430262e-05, + "loss": 2.364, + "step": 2383, + "task_loss": 1.6325865983963013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.385978937149048, + "epoch": 2.02, + "learning_rate": 3.992392223161454e-05, + "loss": 1.911, + "step": 2384, + "task_loss": 1.5452477931976318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.232536792755127, + "epoch": 2.02, + "learning_rate": 3.9919695688926465e-05, + "loss": 2.6697, + "step": 2385, + "task_loss": 1.540758728981018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.6594595909118652, + "epoch": 2.02, + "learning_rate": 3.991546914623838e-05, + "loss": 2.5036, + "step": 2386, + "task_loss": 1.640219807624817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7822601795196533, + "epoch": 2.02, + "learning_rate": 3.99112426035503e-05, + "loss": 2.1458, + "step": 2387, + "task_loss": 1.0330513715744019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.7828869819641113, + "epoch": 2.02, + "learning_rate": 3.990701606086222e-05, + "loss": 2.0729, + "step": 2388, + "task_loss": 0.884263277053833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.0850987434387207, + "epoch": 2.02, + "learning_rate": 3.990278951817413e-05, + "loss": 2.4954, + "step": 2389, + "task_loss": 1.7540827989578247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5429188013076782, + "epoch": 2.02, + "learning_rate": 3.989856297548606e-05, + "loss": 1.9325, + "step": 2390, + "task_loss": 1.5614439249038696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.464088797569275, + "epoch": 2.02, + "learning_rate": 3.9894336432797976e-05, + "loss": 1.6323, + "step": 2391, + "task_loss": 1.041124939918518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.936899185180664, + "epoch": 2.02, + "learning_rate": 3.9890109890109896e-05, + "loss": 2.2428, + "step": 2392, + "task_loss": 1.192954421043396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7787879705429077, + "epoch": 2.02, + "learning_rate": 3.988588334742181e-05, + "loss": 1.5976, + "step": 2393, + "task_loss": 2.2010862827301025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.483182668685913, + "epoch": 2.02, + "learning_rate": 3.988165680473373e-05, + "loss": 2.2957, + "step": 2394, + "task_loss": 1.8986337184906006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.556488513946533, + "epoch": 2.02, + "learning_rate": 3.987743026204565e-05, + "loss": 2.3439, + "step": 2395, + "task_loss": 1.7637701034545898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.783862590789795, + "epoch": 2.03, + "learning_rate": 3.987320371935757e-05, + "loss": 1.8387, + "step": 2396, + "task_loss": 1.8849520683288574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7669353485107422, + "epoch": 2.03, + "learning_rate": 3.986897717666949e-05, + "loss": 1.7569, + "step": 2397, + "task_loss": 1.0393316745758057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2654008865356445, + "epoch": 2.03, + "learning_rate": 3.986475063398141e-05, + "loss": 1.622, + "step": 2398, + "task_loss": 1.1282707452774048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6770771741867065, + "epoch": 2.03, + "learning_rate": 3.986052409129332e-05, + "loss": 1.7897, + "step": 2399, + "task_loss": 2.0706143379211426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.143314838409424, + "epoch": 2.03, + "learning_rate": 3.985629754860524e-05, + "loss": 2.0636, + "step": 2400, + "task_loss": 1.0722198486328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.695676326751709, + "epoch": 2.03, + "learning_rate": 3.985207100591716e-05, + "loss": 1.7997, + "step": 2401, + "task_loss": 1.2164547443389893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.676777720451355, + "epoch": 2.03, + "learning_rate": 3.984784446322908e-05, + "loss": 1.6675, + "step": 2402, + "task_loss": 1.9163891077041626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.76197350025177, + "epoch": 2.03, + "learning_rate": 3.9843617920541e-05, + "loss": 1.7632, + "step": 2403, + "task_loss": 1.0540944337844849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4015312194824219, + "epoch": 2.03, + "learning_rate": 3.983939137785292e-05, + "loss": 1.664, + "step": 2404, + "task_loss": 2.305607557296753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7427690029144287, + "epoch": 2.03, + "learning_rate": 3.983516483516483e-05, + "loss": 1.801, + "step": 2405, + "task_loss": 0.9954270124435425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2429957389831543, + "epoch": 2.03, + "learning_rate": 3.983093829247675e-05, + "loss": 1.3147, + "step": 2406, + "task_loss": 0.6306697726249695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5663480758666992, + "epoch": 2.03, + "learning_rate": 3.982671174978868e-05, + "loss": 1.7001, + "step": 2407, + "task_loss": 0.8829794526100159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5441477298736572, + "epoch": 2.04, + "learning_rate": 3.98224852071006e-05, + "loss": 1.5441, + "step": 2408, + "task_loss": 1.432923674583435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4914753437042236, + "epoch": 2.04, + "learning_rate": 3.981825866441251e-05, + "loss": 1.574, + "step": 2409, + "task_loss": 1.3204939365386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2094545364379883, + "epoch": 2.04, + "learning_rate": 3.981403212172443e-05, + "loss": 1.7408, + "step": 2410, + "task_loss": 0.5181313157081604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5648179054260254, + "epoch": 2.04, + "learning_rate": 3.980980557903635e-05, + "loss": 1.5979, + "step": 2411, + "task_loss": 1.1175904273986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3891541957855225, + "epoch": 2.04, + "learning_rate": 3.980557903634827e-05, + "loss": 1.5099, + "step": 2412, + "task_loss": 1.4110456705093384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3231825828552246, + "epoch": 2.04, + "learning_rate": 3.980135249366019e-05, + "loss": 1.7478, + "step": 2413, + "task_loss": 0.7278585433959961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2702136039733887, + "epoch": 2.04, + "learning_rate": 3.979712595097211e-05, + "loss": 0.9821, + "step": 2414, + "task_loss": 0.8774385452270508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9826806783676147, + "epoch": 2.04, + "learning_rate": 3.979289940828402e-05, + "loss": 1.8279, + "step": 2415, + "task_loss": 1.3961982727050781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2732384204864502, + "epoch": 2.04, + "learning_rate": 3.978867286559594e-05, + "loss": 1.5267, + "step": 2416, + "task_loss": 1.3630156517028809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1873409748077393, + "epoch": 2.04, + "learning_rate": 3.978444632290786e-05, + "loss": 1.355, + "step": 2417, + "task_loss": 0.8526145815849304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3411285877227783, + "epoch": 2.04, + "learning_rate": 3.978021978021978e-05, + "loss": 1.4467, + "step": 2418, + "task_loss": 0.6982748508453369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.341094732284546, + "epoch": 2.04, + "learning_rate": 3.97759932375317e-05, + "loss": 1.7472, + "step": 2419, + "task_loss": 0.5854294896125793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3771424293518066, + "epoch": 2.05, + "learning_rate": 3.977176669484362e-05, + "loss": 1.4416, + "step": 2420, + "task_loss": 0.8746435642242432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.424363374710083, + "epoch": 2.05, + "learning_rate": 3.976754015215554e-05, + "loss": 1.7909, + "step": 2421, + "task_loss": 0.8313206434249878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.655148506164551, + "epoch": 2.05, + "learning_rate": 3.9763313609467454e-05, + "loss": 1.826, + "step": 2422, + "task_loss": 2.3594305515289307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2628995180130005, + "epoch": 2.05, + "learning_rate": 3.9759087066779374e-05, + "loss": 1.6797, + "step": 2423, + "task_loss": 0.7806808948516846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3123974800109863, + "epoch": 2.05, + "learning_rate": 3.97548605240913e-05, + "loss": 1.5577, + "step": 2424, + "task_loss": 0.5960263609886169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8879499435424805, + "epoch": 2.05, + "learning_rate": 3.9750633981403214e-05, + "loss": 1.6582, + "step": 2425, + "task_loss": 1.5674095153808594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.2396693229675293, + "epoch": 2.05, + "learning_rate": 3.974640743871513e-05, + "loss": 1.823, + "step": 2426, + "task_loss": 2.3232245445251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.502490758895874, + "epoch": 2.05, + "learning_rate": 3.974218089602705e-05, + "loss": 1.2049, + "step": 2427, + "task_loss": 1.333996295928955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2393107414245605, + "epoch": 2.05, + "learning_rate": 3.9737954353338966e-05, + "loss": 1.4193, + "step": 2428, + "task_loss": 1.0256779193878174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.13899564743042, + "epoch": 2.05, + "learning_rate": 3.973372781065089e-05, + "loss": 1.5823, + "step": 2429, + "task_loss": 0.773370623588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5242952108383179, + "epoch": 2.05, + "learning_rate": 3.972950126796281e-05, + "loss": 1.5033, + "step": 2430, + "task_loss": 1.3092904090881348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3226540088653564, + "epoch": 2.05, + "learning_rate": 3.9725274725274725e-05, + "loss": 1.6954, + "step": 2431, + "task_loss": 0.9623933434486389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.5996503829956055, + "epoch": 2.06, + "learning_rate": 3.9721048182586645e-05, + "loss": 1.8684, + "step": 2432, + "task_loss": 1.4465861320495605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6925464868545532, + "epoch": 2.06, + "learning_rate": 3.9716821639898565e-05, + "loss": 1.1295, + "step": 2433, + "task_loss": 1.0429103374481201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8419778347015381, + "epoch": 2.06, + "learning_rate": 3.971259509721048e-05, + "loss": 1.2644, + "step": 2434, + "task_loss": 0.8587064146995544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.551980972290039, + "epoch": 2.06, + "learning_rate": 3.9708368554522404e-05, + "loss": 1.4314, + "step": 2435, + "task_loss": 1.6084686517715454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6862212419509888, + "epoch": 2.06, + "learning_rate": 3.9704142011834324e-05, + "loss": 1.652, + "step": 2436, + "task_loss": 1.7060869932174683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3340411186218262, + "epoch": 2.06, + "learning_rate": 3.9699915469146244e-05, + "loss": 1.4734, + "step": 2437, + "task_loss": 0.697928249835968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0594673156738281, + "epoch": 2.06, + "learning_rate": 3.969568892645816e-05, + "loss": 1.5607, + "step": 2438, + "task_loss": 1.1826533079147339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7003694772720337, + "epoch": 2.06, + "learning_rate": 3.9691462383770076e-05, + "loss": 1.8563, + "step": 2439, + "task_loss": 1.1646924018859863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4151952266693115, + "epoch": 2.06, + "learning_rate": 3.9687235841081996e-05, + "loss": 1.4569, + "step": 2440, + "task_loss": 1.145049810409546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6371992826461792, + "epoch": 2.06, + "learning_rate": 3.9683009298393916e-05, + "loss": 1.3912, + "step": 2441, + "task_loss": 0.9268471598625183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.064062237739563, + "epoch": 2.06, + "learning_rate": 3.9678782755705836e-05, + "loss": 1.2124, + "step": 2442, + "task_loss": 0.448896199464798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.528700828552246, + "epoch": 2.07, + "learning_rate": 3.9674556213017755e-05, + "loss": 1.6382, + "step": 2443, + "task_loss": 2.4253177642822266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.474086046218872, + "epoch": 2.07, + "learning_rate": 3.967032967032967e-05, + "loss": 1.2923, + "step": 2444, + "task_loss": 1.7279953956604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8257012367248535, + "epoch": 2.07, + "learning_rate": 3.966610312764159e-05, + "loss": 1.5224, + "step": 2445, + "task_loss": 1.5101815462112427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.758450984954834, + "epoch": 2.07, + "learning_rate": 3.9661876584953515e-05, + "loss": 1.4327, + "step": 2446, + "task_loss": 1.8750847578048706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.162940502166748, + "epoch": 2.07, + "learning_rate": 3.965765004226543e-05, + "loss": 1.3452, + "step": 2447, + "task_loss": 0.647018313407898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.1899538040161133, + "epoch": 2.07, + "learning_rate": 3.965342349957735e-05, + "loss": 1.5344, + "step": 2448, + "task_loss": 1.8895554542541504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.339575171470642, + "epoch": 2.07, + "learning_rate": 3.964919695688927e-05, + "loss": 1.0786, + "step": 2449, + "task_loss": 0.9051080942153931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.655377984046936, + "epoch": 2.07, + "learning_rate": 3.964497041420119e-05, + "loss": 1.0924, + "step": 2450, + "task_loss": 0.20561100542545319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7912007570266724, + "epoch": 2.07, + "learning_rate": 3.96407438715131e-05, + "loss": 1.1015, + "step": 2451, + "task_loss": 0.369274765253067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7987916469573975, + "epoch": 2.07, + "learning_rate": 3.9636517328825026e-05, + "loss": 1.3633, + "step": 2452, + "task_loss": 2.451631546020508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5714248418807983, + "epoch": 2.07, + "learning_rate": 3.9632290786136946e-05, + "loss": 1.3145, + "step": 2453, + "task_loss": 1.3857307434082031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6587679386138916, + "epoch": 2.07, + "learning_rate": 3.962806424344886e-05, + "loss": 1.5088, + "step": 2454, + "task_loss": 0.6644378900527954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5347239971160889, + "epoch": 2.08, + "learning_rate": 3.962383770076078e-05, + "loss": 1.4524, + "step": 2455, + "task_loss": 1.3674412965774536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7437262535095215, + "epoch": 2.08, + "learning_rate": 3.96196111580727e-05, + "loss": 1.3974, + "step": 2456, + "task_loss": 1.4345266819000244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4868624210357666, + "epoch": 2.08, + "learning_rate": 3.961538461538462e-05, + "loss": 1.1858, + "step": 2457, + "task_loss": 1.7337347269058228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6236227750778198, + "epoch": 2.08, + "learning_rate": 3.961115807269654e-05, + "loss": 1.1912, + "step": 2458, + "task_loss": 1.0803594589233398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4364078044891357, + "epoch": 2.08, + "learning_rate": 3.960693153000846e-05, + "loss": 1.3049, + "step": 2459, + "task_loss": 1.3383210897445679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.143260955810547, + "epoch": 2.08, + "learning_rate": 3.960270498732037e-05, + "loss": 1.5919, + "step": 2460, + "task_loss": 1.4985262155532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5759220123291016, + "epoch": 2.08, + "learning_rate": 3.959847844463229e-05, + "loss": 1.627, + "step": 2461, + "task_loss": 1.5590928792953491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4170928001403809, + "epoch": 2.08, + "learning_rate": 3.959425190194421e-05, + "loss": 1.1669, + "step": 2462, + "task_loss": 0.2683714032173157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9110627174377441, + "epoch": 2.08, + "learning_rate": 3.959002535925613e-05, + "loss": 1.6216, + "step": 2463, + "task_loss": 0.4389840066432953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7188645601272583, + "epoch": 2.08, + "learning_rate": 3.958579881656805e-05, + "loss": 1.2045, + "step": 2464, + "task_loss": 1.0582202672958374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7483444213867188, + "epoch": 2.08, + "learning_rate": 3.958157227387997e-05, + "loss": 1.6748, + "step": 2465, + "task_loss": 1.5068576335906982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5472084283828735, + "epoch": 2.08, + "learning_rate": 3.957734573119189e-05, + "loss": 1.1954, + "step": 2466, + "task_loss": 1.5050468444824219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.366323709487915, + "epoch": 2.09, + "learning_rate": 3.95731191885038e-05, + "loss": 1.2196, + "step": 2467, + "task_loss": 1.2034974098205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.377177357673645, + "epoch": 2.09, + "learning_rate": 3.956889264581572e-05, + "loss": 1.176, + "step": 2468, + "task_loss": 0.8319675326347351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0718408823013306, + "epoch": 2.09, + "learning_rate": 3.956466610312765e-05, + "loss": 1.0282, + "step": 2469, + "task_loss": 1.5994333028793335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6089887619018555, + "epoch": 2.09, + "learning_rate": 3.956043956043956e-05, + "loss": 1.2418, + "step": 2470, + "task_loss": 1.8430551290512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9888797402381897, + "epoch": 2.09, + "learning_rate": 3.955621301775148e-05, + "loss": 1.237, + "step": 2471, + "task_loss": 1.0910674333572388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0491304397583008, + "epoch": 2.09, + "learning_rate": 3.95519864750634e-05, + "loss": 1.6628, + "step": 2472, + "task_loss": 1.254642128944397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3919920921325684, + "epoch": 2.09, + "learning_rate": 3.9547759932375314e-05, + "loss": 1.405, + "step": 2473, + "task_loss": 1.1151994466781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9127373695373535, + "epoch": 2.09, + "learning_rate": 3.954353338968724e-05, + "loss": 1.4587, + "step": 2474, + "task_loss": 1.230209231376648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8920097947120667, + "epoch": 2.09, + "learning_rate": 3.953930684699916e-05, + "loss": 1.2473, + "step": 2475, + "task_loss": 0.7124837636947632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8591822385787964, + "epoch": 2.09, + "learning_rate": 3.953508030431107e-05, + "loss": 1.2194, + "step": 2476, + "task_loss": 0.4764343500137329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.038776159286499, + "epoch": 2.09, + "learning_rate": 3.953085376162299e-05, + "loss": 1.3827, + "step": 2477, + "task_loss": 1.8782124519348145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1613956689834595, + "epoch": 2.09, + "learning_rate": 3.952662721893491e-05, + "loss": 1.3358, + "step": 2478, + "task_loss": 0.9608829617500305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5416390895843506, + "epoch": 2.1, + "learning_rate": 3.952240067624683e-05, + "loss": 1.0554, + "step": 2479, + "task_loss": 0.8468848466873169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4891023635864258, + "epoch": 2.1, + "learning_rate": 3.951817413355875e-05, + "loss": 1.4654, + "step": 2480, + "task_loss": 0.847949743270874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.405877709388733, + "epoch": 2.1, + "learning_rate": 3.951394759087067e-05, + "loss": 1.4796, + "step": 2481, + "task_loss": 1.156759262084961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9040069580078125, + "epoch": 2.1, + "learning_rate": 3.950972104818259e-05, + "loss": 1.0208, + "step": 2482, + "task_loss": 0.6528639793395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6594023704528809, + "epoch": 2.1, + "learning_rate": 3.9505494505494504e-05, + "loss": 1.2315, + "step": 2483, + "task_loss": 0.9274123907089233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0847699642181396, + "epoch": 2.1, + "learning_rate": 3.9501267962806424e-05, + "loss": 0.9808, + "step": 2484, + "task_loss": 1.0958430767059326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1887664794921875, + "epoch": 2.1, + "learning_rate": 3.9497041420118344e-05, + "loss": 1.4002, + "step": 2485, + "task_loss": 0.9094762206077576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7207974791526794, + "epoch": 2.1, + "learning_rate": 3.9492814877430263e-05, + "loss": 1.5657, + "step": 2486, + "task_loss": 1.386243224143982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.3925623893737793, + "epoch": 2.1, + "learning_rate": 3.948858833474218e-05, + "loss": 1.6115, + "step": 2487, + "task_loss": 2.2891972064971924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 3.0303375720977783, + "epoch": 2.1, + "learning_rate": 3.94843617920541e-05, + "loss": 1.9384, + "step": 2488, + "task_loss": 3.155574321746826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9076354503631592, + "epoch": 2.1, + "learning_rate": 3.9480135249366016e-05, + "loss": 1.5362, + "step": 2489, + "task_loss": 2.11348295211792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4189292192459106, + "epoch": 2.1, + "learning_rate": 3.9475908706677936e-05, + "loss": 1.2295, + "step": 2490, + "task_loss": 0.961495041847229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0058842897415161, + "epoch": 2.11, + "learning_rate": 3.947168216398986e-05, + "loss": 1.2178, + "step": 2491, + "task_loss": 1.0458797216415405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9723291397094727, + "epoch": 2.11, + "learning_rate": 3.9467455621301775e-05, + "loss": 0.9829, + "step": 2492, + "task_loss": 1.7455593347549438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0847389698028564, + "epoch": 2.11, + "learning_rate": 3.9463229078613695e-05, + "loss": 1.5082, + "step": 2493, + "task_loss": 1.2623860836029053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0085761547088623, + "epoch": 2.11, + "learning_rate": 3.9459002535925615e-05, + "loss": 1.0233, + "step": 2494, + "task_loss": 0.835718035697937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9100750088691711, + "epoch": 2.11, + "learning_rate": 3.9454775993237534e-05, + "loss": 1.0876, + "step": 2495, + "task_loss": 0.506028950214386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7700991630554199, + "epoch": 2.11, + "learning_rate": 3.9450549450549454e-05, + "loss": 1.1632, + "step": 2496, + "task_loss": 0.43916288018226624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.519012212753296, + "epoch": 2.11, + "learning_rate": 3.9446322907861374e-05, + "loss": 1.1096, + "step": 2497, + "task_loss": 1.6551190614700317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8177735805511475, + "epoch": 2.11, + "learning_rate": 3.9442096365173294e-05, + "loss": 1.0053, + "step": 2498, + "task_loss": 1.852524995803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9728920459747314, + "epoch": 2.11, + "learning_rate": 3.9437869822485207e-05, + "loss": 1.2923, + "step": 2499, + "task_loss": 0.9941496849060059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2772374153137207, + "epoch": 2.11, + "learning_rate": 3.9433643279797126e-05, + "loss": 1.061, + "step": 2500, + "task_loss": 0.5361504554748535 + }, + { + "epoch": 2.11, + "eval_accuracy": 0.855049504950495, + "eval_loss": 0.752260684967041, + "eval_runtime": 228.5962, + "eval_samples_per_second": 110.457, + "eval_steps_per_second": 0.866, + "step": 2500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5476107597351074, + "epoch": 2.11, + "learning_rate": 3.9429416737109046e-05, + "loss": 0.8758, + "step": 2501, + "task_loss": 0.9209589958190918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5106786489486694, + "epoch": 2.11, + "learning_rate": 3.9425190194420966e-05, + "loss": 1.3833, + "step": 2502, + "task_loss": 0.9346505403518677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4420406818389893, + "epoch": 2.12, + "learning_rate": 3.9420963651732885e-05, + "loss": 1.3425, + "step": 2503, + "task_loss": 0.6034866571426392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0130925178527832, + "epoch": 2.12, + "learning_rate": 3.9416737109044805e-05, + "loss": 1.4086, + "step": 2504, + "task_loss": 0.5354030728340149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9626778364181519, + "epoch": 2.12, + "learning_rate": 3.941251056635672e-05, + "loss": 1.0621, + "step": 2505, + "task_loss": 1.0288535356521606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.411634087562561, + "epoch": 2.12, + "learning_rate": 3.940828402366864e-05, + "loss": 1.4817, + "step": 2506, + "task_loss": 1.0778255462646484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5689705610275269, + "epoch": 2.12, + "learning_rate": 3.940405748098056e-05, + "loss": 1.2598, + "step": 2507, + "task_loss": 0.6316152811050415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.121328592300415, + "epoch": 2.12, + "learning_rate": 3.9399830938292484e-05, + "loss": 1.124, + "step": 2508, + "task_loss": 1.2931735515594482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6839990615844727, + "epoch": 2.12, + "learning_rate": 3.93956043956044e-05, + "loss": 1.3684, + "step": 2509, + "task_loss": 1.3924341201782227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.210131287574768, + "epoch": 2.12, + "learning_rate": 3.939137785291632e-05, + "loss": 1.0921, + "step": 2510, + "task_loss": 0.4956583082675934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.299082636833191, + "epoch": 2.12, + "learning_rate": 3.938715131022824e-05, + "loss": 1.5506, + "step": 2511, + "task_loss": 1.6400787830352783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9118422269821167, + "epoch": 2.12, + "learning_rate": 3.938292476754015e-05, + "loss": 1.3016, + "step": 2512, + "task_loss": 0.7700437903404236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.201402187347412, + "epoch": 2.12, + "learning_rate": 3.9378698224852076e-05, + "loss": 0.9514, + "step": 2513, + "task_loss": 0.7935715317726135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6507834196090698, + "epoch": 2.13, + "learning_rate": 3.9374471682163996e-05, + "loss": 1.3384, + "step": 2514, + "task_loss": 1.067337989807129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.206497311592102, + "epoch": 2.13, + "learning_rate": 3.937024513947591e-05, + "loss": 1.1819, + "step": 2515, + "task_loss": 1.0327556133270264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6909717321395874, + "epoch": 2.13, + "learning_rate": 3.936601859678783e-05, + "loss": 1.2015, + "step": 2516, + "task_loss": 2.1916956901550293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1243207454681396, + "epoch": 2.13, + "learning_rate": 3.936179205409975e-05, + "loss": 0.8962, + "step": 2517, + "task_loss": 0.9569413661956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1356823444366455, + "epoch": 2.13, + "learning_rate": 3.935756551141167e-05, + "loss": 1.1087, + "step": 2518, + "task_loss": 0.3992585241794586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7975798845291138, + "epoch": 2.13, + "learning_rate": 3.935333896872359e-05, + "loss": 1.2118, + "step": 2519, + "task_loss": 1.6944652795791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9308773279190063, + "epoch": 2.13, + "learning_rate": 3.934911242603551e-05, + "loss": 1.3052, + "step": 2520, + "task_loss": 1.083774209022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2401868104934692, + "epoch": 2.13, + "learning_rate": 3.934488588334742e-05, + "loss": 1.3036, + "step": 2521, + "task_loss": 1.7014265060424805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6791520714759827, + "epoch": 2.13, + "learning_rate": 3.934065934065934e-05, + "loss": 1.2344, + "step": 2522, + "task_loss": 1.699588418006897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1963629722595215, + "epoch": 2.13, + "learning_rate": 3.933643279797126e-05, + "loss": 1.1654, + "step": 2523, + "task_loss": 0.5291019082069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.035501480102539, + "epoch": 2.13, + "learning_rate": 3.933220625528318e-05, + "loss": 1.2282, + "step": 2524, + "task_loss": 1.068690538406372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8149365186691284, + "epoch": 2.13, + "learning_rate": 3.93279797125951e-05, + "loss": 1.2286, + "step": 2525, + "task_loss": 1.7638338804244995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9684966206550598, + "epoch": 2.14, + "learning_rate": 3.932375316990702e-05, + "loss": 1.0901, + "step": 2526, + "task_loss": 0.9050475358963013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2119160890579224, + "epoch": 2.14, + "learning_rate": 3.931952662721894e-05, + "loss": 1.0866, + "step": 2527, + "task_loss": 0.8774319291114807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1510636806488037, + "epoch": 2.14, + "learning_rate": 3.931530008453085e-05, + "loss": 0.9694, + "step": 2528, + "task_loss": 1.1399471759796143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4225149154663086, + "epoch": 2.14, + "learning_rate": 3.931107354184277e-05, + "loss": 1.1414, + "step": 2529, + "task_loss": 1.183040738105774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8627368211746216, + "epoch": 2.14, + "learning_rate": 3.93068469991547e-05, + "loss": 1.2972, + "step": 2530, + "task_loss": 1.6322598457336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3751482963562012, + "epoch": 2.14, + "learning_rate": 3.930262045646661e-05, + "loss": 1.4511, + "step": 2531, + "task_loss": 1.8769856691360474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5390530824661255, + "epoch": 2.14, + "learning_rate": 3.929839391377853e-05, + "loss": 1.1242, + "step": 2532, + "task_loss": 1.0099105834960938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3190017938613892, + "epoch": 2.14, + "learning_rate": 3.929416737109045e-05, + "loss": 1.1558, + "step": 2533, + "task_loss": 2.2437427043914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2159230709075928, + "epoch": 2.14, + "learning_rate": 3.9289940828402364e-05, + "loss": 1.1851, + "step": 2534, + "task_loss": 1.2412211894989014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4506877660751343, + "epoch": 2.14, + "learning_rate": 3.928571428571429e-05, + "loss": 1.04, + "step": 2535, + "task_loss": 0.6821317672729492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6746113300323486, + "epoch": 2.14, + "learning_rate": 3.928148774302621e-05, + "loss": 1.0522, + "step": 2536, + "task_loss": 1.076552391052246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6370927095413208, + "epoch": 2.14, + "learning_rate": 3.927726120033813e-05, + "loss": 1.4058, + "step": 2537, + "task_loss": 1.557195782661438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0421838760375977, + "epoch": 2.15, + "learning_rate": 3.927303465765004e-05, + "loss": 1.1944, + "step": 2538, + "task_loss": 0.919719398021698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8270502686500549, + "epoch": 2.15, + "learning_rate": 3.926880811496196e-05, + "loss": 1.304, + "step": 2539, + "task_loss": 0.5443058013916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8390494585037231, + "epoch": 2.15, + "learning_rate": 3.926458157227388e-05, + "loss": 1.0278, + "step": 2540, + "task_loss": 0.420135498046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.151467204093933, + "epoch": 2.15, + "learning_rate": 3.92603550295858e-05, + "loss": 1.1576, + "step": 2541, + "task_loss": 1.2587696313858032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9899442195892334, + "epoch": 2.15, + "learning_rate": 3.925612848689772e-05, + "loss": 1.04, + "step": 2542, + "task_loss": 1.562186360359192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1969377994537354, + "epoch": 2.15, + "learning_rate": 3.925190194420964e-05, + "loss": 1.3074, + "step": 2543, + "task_loss": 1.1316585540771484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0923492908477783, + "epoch": 2.15, + "learning_rate": 3.9247675401521554e-05, + "loss": 1.1587, + "step": 2544, + "task_loss": 0.2309933602809906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1078921556472778, + "epoch": 2.15, + "learning_rate": 3.9243448858833474e-05, + "loss": 1.0046, + "step": 2545, + "task_loss": 1.6790353059768677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0529794692993164, + "epoch": 2.15, + "learning_rate": 3.9239222316145394e-05, + "loss": 1.0305, + "step": 2546, + "task_loss": 1.0452624559402466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7505550384521484, + "epoch": 2.15, + "learning_rate": 3.923499577345731e-05, + "loss": 1.2823, + "step": 2547, + "task_loss": 1.2958886623382568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1431958675384521, + "epoch": 2.15, + "learning_rate": 3.923076923076923e-05, + "loss": 1.0215, + "step": 2548, + "task_loss": 0.5999258756637573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.031099796295166, + "epoch": 2.15, + "learning_rate": 3.922654268808115e-05, + "loss": 1.3228, + "step": 2549, + "task_loss": 1.6370066404342651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4943218231201172, + "epoch": 2.16, + "learning_rate": 3.9222316145393066e-05, + "loss": 1.2516, + "step": 2550, + "task_loss": 1.4671874046325684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8828465342521667, + "epoch": 2.16, + "learning_rate": 3.9218089602704986e-05, + "loss": 1.0493, + "step": 2551, + "task_loss": 0.6104852557182312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9021772146224976, + "epoch": 2.16, + "learning_rate": 3.921386306001691e-05, + "loss": 1.0642, + "step": 2552, + "task_loss": 1.3527050018310547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.170947551727295, + "epoch": 2.16, + "learning_rate": 3.920963651732883e-05, + "loss": 0.949, + "step": 2553, + "task_loss": 1.1193286180496216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.063212275505066, + "epoch": 2.16, + "learning_rate": 3.9205409974640745e-05, + "loss": 0.9539, + "step": 2554, + "task_loss": 0.4321344494819641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1940295696258545, + "epoch": 2.16, + "learning_rate": 3.9201183431952664e-05, + "loss": 1.1096, + "step": 2555, + "task_loss": 0.8717199563980103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4430387020111084, + "epoch": 2.16, + "learning_rate": 3.9196956889264584e-05, + "loss": 1.293, + "step": 2556, + "task_loss": 1.941658616065979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.499438762664795, + "epoch": 2.16, + "learning_rate": 3.9192730346576504e-05, + "loss": 1.2446, + "step": 2557, + "task_loss": 2.039232015609741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.101204752922058, + "epoch": 2.16, + "learning_rate": 3.9188503803888424e-05, + "loss": 0.8893, + "step": 2558, + "task_loss": 0.7103560566902161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0475131273269653, + "epoch": 2.16, + "learning_rate": 3.9184277261200343e-05, + "loss": 0.9922, + "step": 2559, + "task_loss": 2.1834683418273926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2570480108261108, + "epoch": 2.16, + "learning_rate": 3.9180050718512256e-05, + "loss": 1.3178, + "step": 2560, + "task_loss": 0.6201184988021851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.056185007095337, + "epoch": 2.16, + "learning_rate": 3.9175824175824176e-05, + "loss": 0.8466, + "step": 2561, + "task_loss": 0.4680480360984802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.105933427810669, + "epoch": 2.17, + "learning_rate": 3.9171597633136096e-05, + "loss": 1.348, + "step": 2562, + "task_loss": 1.060158610343933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2494783401489258, + "epoch": 2.17, + "learning_rate": 3.9167371090448016e-05, + "loss": 1.0974, + "step": 2563, + "task_loss": 0.6414755582809448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8980710506439209, + "epoch": 2.17, + "learning_rate": 3.9163144547759935e-05, + "loss": 0.8759, + "step": 2564, + "task_loss": 1.0450607538223267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9811490774154663, + "epoch": 2.17, + "learning_rate": 3.9158918005071855e-05, + "loss": 1.058, + "step": 2565, + "task_loss": 0.25542783737182617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7080172300338745, + "epoch": 2.17, + "learning_rate": 3.9154691462383775e-05, + "loss": 1.1566, + "step": 2566, + "task_loss": 1.6732734441757202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.152398705482483, + "epoch": 2.17, + "learning_rate": 3.915046491969569e-05, + "loss": 1.3252, + "step": 2567, + "task_loss": 1.004660964012146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1715047359466553, + "epoch": 2.17, + "learning_rate": 3.914623837700761e-05, + "loss": 0.8931, + "step": 2568, + "task_loss": 0.5798588991165161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8858317136764526, + "epoch": 2.17, + "learning_rate": 3.9142011834319534e-05, + "loss": 1.2267, + "step": 2569, + "task_loss": 0.7226612567901611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0393520593643188, + "epoch": 2.17, + "learning_rate": 3.913778529163145e-05, + "loss": 0.9466, + "step": 2570, + "task_loss": 0.22246521711349487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9375803470611572, + "epoch": 2.17, + "learning_rate": 3.913355874894337e-05, + "loss": 1.2883, + "step": 2571, + "task_loss": 0.997008740901947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.101149320602417, + "epoch": 2.17, + "learning_rate": 3.9129332206255286e-05, + "loss": 0.9537, + "step": 2572, + "task_loss": 1.194193959236145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6067936420440674, + "epoch": 2.17, + "learning_rate": 3.91251056635672e-05, + "loss": 0.8612, + "step": 2573, + "task_loss": 0.547789454460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1956615447998047, + "epoch": 2.18, + "learning_rate": 3.912087912087912e-05, + "loss": 1.2692, + "step": 2574, + "task_loss": 1.2824865579605103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1936125755310059, + "epoch": 2.18, + "learning_rate": 3.9116652578191046e-05, + "loss": 1.0536, + "step": 2575, + "task_loss": 0.8501572012901306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1733366250991821, + "epoch": 2.18, + "learning_rate": 3.911242603550296e-05, + "loss": 1.263, + "step": 2576, + "task_loss": 1.5960862636566162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9040282368659973, + "epoch": 2.18, + "learning_rate": 3.910819949281488e-05, + "loss": 1.0244, + "step": 2577, + "task_loss": 0.7344114184379578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9368060827255249, + "epoch": 2.18, + "learning_rate": 3.91039729501268e-05, + "loss": 0.8905, + "step": 2578, + "task_loss": 1.2730581760406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8898172378540039, + "epoch": 2.18, + "learning_rate": 3.909974640743871e-05, + "loss": 1.3308, + "step": 2579, + "task_loss": 1.2473318576812744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4191974401474, + "epoch": 2.18, + "learning_rate": 3.909551986475064e-05, + "loss": 1.3315, + "step": 2580, + "task_loss": 0.5722655057907104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0766643285751343, + "epoch": 2.18, + "learning_rate": 3.909129332206256e-05, + "loss": 0.9769, + "step": 2581, + "task_loss": 0.3471655249595642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.018371343612671, + "epoch": 2.18, + "learning_rate": 3.908706677937448e-05, + "loss": 0.9782, + "step": 2582, + "task_loss": 0.5412055850028992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2503197193145752, + "epoch": 2.18, + "learning_rate": 3.908284023668639e-05, + "loss": 0.871, + "step": 2583, + "task_loss": 1.1371228694915771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2818759679794312, + "epoch": 2.18, + "learning_rate": 3.907861369399831e-05, + "loss": 1.0969, + "step": 2584, + "task_loss": 1.6001414060592651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0746575593948364, + "epoch": 2.19, + "learning_rate": 3.907438715131023e-05, + "loss": 0.8931, + "step": 2585, + "task_loss": 0.4314166009426117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4091876745224, + "epoch": 2.19, + "learning_rate": 3.907016060862215e-05, + "loss": 1.0904, + "step": 2586, + "task_loss": 0.9434720873832703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6515042781829834, + "epoch": 2.19, + "learning_rate": 3.906593406593407e-05, + "loss": 0.9138, + "step": 2587, + "task_loss": 0.8693208694458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.15598464012146, + "epoch": 2.19, + "learning_rate": 3.906170752324599e-05, + "loss": 1.0352, + "step": 2588, + "task_loss": 1.2237881422042847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9320387244224548, + "epoch": 2.19, + "learning_rate": 3.90574809805579e-05, + "loss": 0.9555, + "step": 2589, + "task_loss": 1.5926427841186523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3917443752288818, + "epoch": 2.19, + "learning_rate": 3.905325443786982e-05, + "loss": 1.4074, + "step": 2590, + "task_loss": 1.4200363159179688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2033021450042725, + "epoch": 2.19, + "learning_rate": 3.904902789518174e-05, + "loss": 0.9249, + "step": 2591, + "task_loss": 1.2091560363769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0083342790603638, + "epoch": 2.19, + "learning_rate": 3.904480135249366e-05, + "loss": 0.9561, + "step": 2592, + "task_loss": 0.9786311984062195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7833329439163208, + "epoch": 2.19, + "learning_rate": 3.904057480980558e-05, + "loss": 0.587, + "step": 2593, + "task_loss": 0.47942662239074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6736243963241577, + "epoch": 2.19, + "learning_rate": 3.90363482671175e-05, + "loss": 1.1556, + "step": 2594, + "task_loss": 1.3562278747558594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0056538581848145, + "epoch": 2.19, + "learning_rate": 3.903212172442942e-05, + "loss": 0.8437, + "step": 2595, + "task_loss": 1.1796281337738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5997281074523926, + "epoch": 2.19, + "learning_rate": 3.902789518174133e-05, + "loss": 0.9662, + "step": 2596, + "task_loss": 1.2729130983352661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7086240649223328, + "epoch": 2.2, + "learning_rate": 3.902366863905326e-05, + "loss": 0.934, + "step": 2597, + "task_loss": 0.572425365447998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3163812160491943, + "epoch": 2.2, + "learning_rate": 3.901944209636518e-05, + "loss": 1.118, + "step": 2598, + "task_loss": 0.6805679202079773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2768546342849731, + "epoch": 2.2, + "learning_rate": 3.901521555367709e-05, + "loss": 1.0428, + "step": 2599, + "task_loss": 1.9813642501831055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8333299160003662, + "epoch": 2.2, + "learning_rate": 3.901098901098901e-05, + "loss": 0.7286, + "step": 2600, + "task_loss": 0.9594441056251526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6751474142074585, + "epoch": 2.2, + "learning_rate": 3.900676246830093e-05, + "loss": 1.0864, + "step": 2601, + "task_loss": 0.8831893801689148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2950180768966675, + "epoch": 2.2, + "learning_rate": 3.900253592561285e-05, + "loss": 1.134, + "step": 2602, + "task_loss": 0.8364415764808655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4203542470932007, + "epoch": 2.2, + "learning_rate": 3.899830938292477e-05, + "loss": 1.1994, + "step": 2603, + "task_loss": 1.4373823404312134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9169692993164062, + "epoch": 2.2, + "learning_rate": 3.899408284023669e-05, + "loss": 1.087, + "step": 2604, + "task_loss": 1.1711246967315674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4896659851074219, + "epoch": 2.2, + "learning_rate": 3.8989856297548604e-05, + "loss": 1.0821, + "step": 2605, + "task_loss": 0.6582728624343872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7764432430267334, + "epoch": 2.2, + "learning_rate": 3.8985629754860524e-05, + "loss": 0.8271, + "step": 2606, + "task_loss": 0.7289114594459534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2794256210327148, + "epoch": 2.2, + "learning_rate": 3.8981403212172443e-05, + "loss": 1.0386, + "step": 2607, + "task_loss": 0.867957592010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4030076265335083, + "epoch": 2.2, + "learning_rate": 3.897717666948436e-05, + "loss": 1.3621, + "step": 2608, + "task_loss": 1.2202754020690918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9415581226348877, + "epoch": 2.21, + "learning_rate": 3.897295012679628e-05, + "loss": 1.2506, + "step": 2609, + "task_loss": 1.256840705871582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.242197036743164, + "epoch": 2.21, + "learning_rate": 3.89687235841082e-05, + "loss": 0.8058, + "step": 2610, + "task_loss": 0.2458876073360443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2898988723754883, + "epoch": 2.21, + "learning_rate": 3.896449704142012e-05, + "loss": 1.1893, + "step": 2611, + "task_loss": 2.9621353149414062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5680139064788818, + "epoch": 2.21, + "learning_rate": 3.8960270498732035e-05, + "loss": 1.2619, + "step": 2612, + "task_loss": 1.2702432870864868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1674429178237915, + "epoch": 2.21, + "learning_rate": 3.8956043956043955e-05, + "loss": 1.2152, + "step": 2613, + "task_loss": 0.5675352811813354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5425143241882324, + "epoch": 2.21, + "learning_rate": 3.895181741335588e-05, + "loss": 1.1851, + "step": 2614, + "task_loss": 0.25126349925994873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0818455219268799, + "epoch": 2.21, + "learning_rate": 3.8947590870667795e-05, + "loss": 0.9039, + "step": 2615, + "task_loss": 1.5480843782424927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9032998085021973, + "epoch": 2.21, + "learning_rate": 3.8943364327979714e-05, + "loss": 1.0538, + "step": 2616, + "task_loss": 1.5938329696655273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2177902460098267, + "epoch": 2.21, + "learning_rate": 3.8939137785291634e-05, + "loss": 1.2493, + "step": 2617, + "task_loss": 1.9349310398101807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1987149715423584, + "epoch": 2.21, + "learning_rate": 3.893491124260355e-05, + "loss": 1.0721, + "step": 2618, + "task_loss": 0.4543725550174713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0904655456542969, + "epoch": 2.21, + "learning_rate": 3.8930684699915474e-05, + "loss": 0.8865, + "step": 2619, + "task_loss": 1.2245304584503174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2970377206802368, + "epoch": 2.21, + "learning_rate": 3.892645815722739e-05, + "loss": 1.1067, + "step": 2620, + "task_loss": 1.9518994092941284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7272325754165649, + "epoch": 2.22, + "learning_rate": 3.8922231614539306e-05, + "loss": 0.9174, + "step": 2621, + "task_loss": 0.597977876663208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.22951340675354, + "epoch": 2.22, + "learning_rate": 3.8918005071851226e-05, + "loss": 0.8941, + "step": 2622, + "task_loss": 1.1773128509521484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3784542679786682, + "epoch": 2.22, + "learning_rate": 3.8913778529163146e-05, + "loss": 1.1644, + "step": 2623, + "task_loss": 0.5718308687210083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9610483050346375, + "epoch": 2.22, + "learning_rate": 3.8909551986475065e-05, + "loss": 1.2162, + "step": 2624, + "task_loss": 0.6378340125083923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4220547676086426, + "epoch": 2.22, + "learning_rate": 3.8905325443786985e-05, + "loss": 1.169, + "step": 2625, + "task_loss": 1.0365341901779175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9056533575057983, + "epoch": 2.22, + "learning_rate": 3.8901098901098905e-05, + "loss": 1.2776, + "step": 2626, + "task_loss": 1.1235480308532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0311174392700195, + "epoch": 2.22, + "learning_rate": 3.8896872358410825e-05, + "loss": 0.9669, + "step": 2627, + "task_loss": 0.672943115234375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9686775803565979, + "epoch": 2.22, + "learning_rate": 3.889264581572274e-05, + "loss": 0.862, + "step": 2628, + "task_loss": 1.31742525100708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3354463577270508, + "epoch": 2.22, + "learning_rate": 3.888841927303466e-05, + "loss": 0.8191, + "step": 2629, + "task_loss": 1.3126877546310425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1685891151428223, + "epoch": 2.22, + "learning_rate": 3.888419273034658e-05, + "loss": 0.7217, + "step": 2630, + "task_loss": 0.5768212676048279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4074264764785767, + "epoch": 2.22, + "learning_rate": 3.88799661876585e-05, + "loss": 1.0111, + "step": 2631, + "task_loss": 0.9191560745239258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7551029920578003, + "epoch": 2.22, + "learning_rate": 3.8875739644970417e-05, + "loss": 0.8886, + "step": 2632, + "task_loss": 0.31499430537223816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5872465372085571, + "epoch": 2.23, + "learning_rate": 3.8871513102282336e-05, + "loss": 1.1038, + "step": 2633, + "task_loss": 0.927814245223999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1226297616958618, + "epoch": 2.23, + "learning_rate": 3.886728655959425e-05, + "loss": 1.1559, + "step": 2634, + "task_loss": 0.9361493587493896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9992572069168091, + "epoch": 2.23, + "learning_rate": 3.886306001690617e-05, + "loss": 0.9598, + "step": 2635, + "task_loss": 1.009710669517517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5561962127685547, + "epoch": 2.23, + "learning_rate": 3.8858833474218096e-05, + "loss": 0.9598, + "step": 2636, + "task_loss": 0.21820677816867828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0053800344467163, + "epoch": 2.23, + "learning_rate": 3.885460693153001e-05, + "loss": 1.123, + "step": 2637, + "task_loss": 0.43604540824890137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5035459995269775, + "epoch": 2.23, + "learning_rate": 3.885038038884193e-05, + "loss": 0.9778, + "step": 2638, + "task_loss": 0.06432268768548965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.282610535621643, + "epoch": 2.23, + "learning_rate": 3.884615384615385e-05, + "loss": 1.0077, + "step": 2639, + "task_loss": 0.8221561312675476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9048745036125183, + "epoch": 2.23, + "learning_rate": 3.884192730346577e-05, + "loss": 1.0226, + "step": 2640, + "task_loss": 1.1490238904953003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9234331846237183, + "epoch": 2.23, + "learning_rate": 3.883770076077769e-05, + "loss": 1.1303, + "step": 2641, + "task_loss": 2.1652305126190186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6388466358184814, + "epoch": 2.23, + "learning_rate": 3.883347421808961e-05, + "loss": 0.9597, + "step": 2642, + "task_loss": 1.337090015411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8627198934555054, + "epoch": 2.23, + "learning_rate": 3.882924767540153e-05, + "loss": 0.8972, + "step": 2643, + "task_loss": 1.14003324508667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8966836929321289, + "epoch": 2.23, + "learning_rate": 3.882502113271344e-05, + "loss": 1.1171, + "step": 2644, + "task_loss": 0.6047772765159607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9539769887924194, + "epoch": 2.24, + "learning_rate": 3.882079459002536e-05, + "loss": 1.0067, + "step": 2645, + "task_loss": 1.5571770668029785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8053699135780334, + "epoch": 2.24, + "learning_rate": 3.881656804733728e-05, + "loss": 1.0571, + "step": 2646, + "task_loss": 0.4708690047264099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6779573559761047, + "epoch": 2.24, + "learning_rate": 3.88123415046492e-05, + "loss": 0.9754, + "step": 2647, + "task_loss": 0.3710586428642273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6937364339828491, + "epoch": 2.24, + "learning_rate": 3.880811496196112e-05, + "loss": 0.9141, + "step": 2648, + "task_loss": 0.5804961919784546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7161368131637573, + "epoch": 2.24, + "learning_rate": 3.880388841927304e-05, + "loss": 0.9676, + "step": 2649, + "task_loss": 1.038461446762085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4778826236724854, + "epoch": 2.24, + "learning_rate": 3.879966187658495e-05, + "loss": 1.0377, + "step": 2650, + "task_loss": 1.319225549697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9156336784362793, + "epoch": 2.24, + "learning_rate": 3.879543533389687e-05, + "loss": 1.1271, + "step": 2651, + "task_loss": 0.8085601329803467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8943049311637878, + "epoch": 2.24, + "learning_rate": 3.879120879120879e-05, + "loss": 0.983, + "step": 2652, + "task_loss": 0.5118027925491333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9428730607032776, + "epoch": 2.24, + "learning_rate": 3.878698224852072e-05, + "loss": 0.8289, + "step": 2653, + "task_loss": 1.4936695098876953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0826247930526733, + "epoch": 2.24, + "learning_rate": 3.878275570583263e-05, + "loss": 1.1184, + "step": 2654, + "task_loss": 1.79459547996521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7982751727104187, + "epoch": 2.24, + "learning_rate": 3.877852916314455e-05, + "loss": 0.8941, + "step": 2655, + "task_loss": 0.6158744692802429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9716556072235107, + "epoch": 2.24, + "learning_rate": 3.877430262045647e-05, + "loss": 1.1512, + "step": 2656, + "task_loss": 1.2565444707870483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7928158640861511, + "epoch": 2.25, + "learning_rate": 3.877007607776838e-05, + "loss": 1.046, + "step": 2657, + "task_loss": 1.7453557252883911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9852041006088257, + "epoch": 2.25, + "learning_rate": 3.876584953508031e-05, + "loss": 1.0869, + "step": 2658, + "task_loss": 2.253538131713867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6725125312805176, + "epoch": 2.25, + "learning_rate": 3.876162299239223e-05, + "loss": 1.195, + "step": 2659, + "task_loss": 1.124234914779663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9993510246276855, + "epoch": 2.25, + "learning_rate": 3.875739644970414e-05, + "loss": 1.0778, + "step": 2660, + "task_loss": 0.8013314604759216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.948816180229187, + "epoch": 2.25, + "learning_rate": 3.875316990701606e-05, + "loss": 1.0504, + "step": 2661, + "task_loss": 0.5792979598045349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8851001262664795, + "epoch": 2.25, + "learning_rate": 3.874894336432798e-05, + "loss": 0.8263, + "step": 2662, + "task_loss": 1.1115994453430176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.814497709274292, + "epoch": 2.25, + "learning_rate": 3.87447168216399e-05, + "loss": 0.9557, + "step": 2663, + "task_loss": 1.1476298570632935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6598769426345825, + "epoch": 2.25, + "learning_rate": 3.874049027895182e-05, + "loss": 0.6931, + "step": 2664, + "task_loss": 0.7893716096878052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9944308996200562, + "epoch": 2.25, + "learning_rate": 3.873626373626374e-05, + "loss": 0.9943, + "step": 2665, + "task_loss": 1.0406333208084106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.307536005973816, + "epoch": 2.25, + "learning_rate": 3.8732037193575654e-05, + "loss": 0.9512, + "step": 2666, + "task_loss": 1.6902304887771606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2127697467803955, + "epoch": 2.25, + "learning_rate": 3.8727810650887574e-05, + "loss": 1.1893, + "step": 2667, + "task_loss": 0.6091403961181641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1961246728897095, + "epoch": 2.26, + "learning_rate": 3.872358410819949e-05, + "loss": 1.1924, + "step": 2668, + "task_loss": 1.3169676065444946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5935120582580566, + "epoch": 2.26, + "learning_rate": 3.871935756551141e-05, + "loss": 1.0087, + "step": 2669, + "task_loss": 1.368773102760315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4729667901992798, + "epoch": 2.26, + "learning_rate": 3.871513102282333e-05, + "loss": 0.7875, + "step": 2670, + "task_loss": 0.21085739135742188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8610870838165283, + "epoch": 2.26, + "learning_rate": 3.871090448013525e-05, + "loss": 1.3971, + "step": 2671, + "task_loss": 1.9653570652008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6469870805740356, + "epoch": 2.26, + "learning_rate": 3.870667793744717e-05, + "loss": 1.0285, + "step": 2672, + "task_loss": 1.5969148874282837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1149704456329346, + "epoch": 2.26, + "learning_rate": 3.8702451394759085e-05, + "loss": 0.8146, + "step": 2673, + "task_loss": 0.7556873559951782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1249204874038696, + "epoch": 2.26, + "learning_rate": 3.8698224852071005e-05, + "loss": 0.8217, + "step": 2674, + "task_loss": 1.1029245853424072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3365437984466553, + "epoch": 2.26, + "learning_rate": 3.869399830938293e-05, + "loss": 0.9237, + "step": 2675, + "task_loss": 0.46681517362594604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9106525182723999, + "epoch": 2.26, + "learning_rate": 3.8689771766694844e-05, + "loss": 1.2073, + "step": 2676, + "task_loss": 0.5507681369781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3034907579421997, + "epoch": 2.26, + "learning_rate": 3.8685545224006764e-05, + "loss": 1.0074, + "step": 2677, + "task_loss": 0.78775954246521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7447793483734131, + "epoch": 2.26, + "learning_rate": 3.8681318681318684e-05, + "loss": 1.0321, + "step": 2678, + "task_loss": 0.6275745630264282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4194958209991455, + "epoch": 2.26, + "learning_rate": 3.86770921386306e-05, + "loss": 0.9003, + "step": 2679, + "task_loss": 0.18546538054943085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.224048137664795, + "epoch": 2.27, + "learning_rate": 3.867286559594252e-05, + "loss": 1.056, + "step": 2680, + "task_loss": 0.8362459540367126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8359700441360474, + "epoch": 2.27, + "learning_rate": 3.866863905325444e-05, + "loss": 0.8212, + "step": 2681, + "task_loss": 0.8788122534751892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8226076364517212, + "epoch": 2.27, + "learning_rate": 3.866441251056636e-05, + "loss": 1.0566, + "step": 2682, + "task_loss": 0.4080098867416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.709947943687439, + "epoch": 2.27, + "learning_rate": 3.8660185967878276e-05, + "loss": 0.8677, + "step": 2683, + "task_loss": 0.6433559060096741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8629521131515503, + "epoch": 2.27, + "learning_rate": 3.8655959425190196e-05, + "loss": 0.9927, + "step": 2684, + "task_loss": 0.8281159400939941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6922454237937927, + "epoch": 2.27, + "learning_rate": 3.8651732882502115e-05, + "loss": 0.6941, + "step": 2685, + "task_loss": 1.3576430082321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8792062997817993, + "epoch": 2.27, + "learning_rate": 3.8647506339814035e-05, + "loss": 0.7034, + "step": 2686, + "task_loss": 0.6500823497772217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5596905946731567, + "epoch": 2.27, + "learning_rate": 3.8643279797125955e-05, + "loss": 1.2937, + "step": 2687, + "task_loss": 0.6133275032043457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1048552989959717, + "epoch": 2.27, + "learning_rate": 3.8639053254437874e-05, + "loss": 1.2472, + "step": 2688, + "task_loss": 0.7475387454032898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8646892309188843, + "epoch": 2.27, + "learning_rate": 3.863482671174979e-05, + "loss": 1.0369, + "step": 2689, + "task_loss": 0.9499238133430481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.389082908630371, + "epoch": 2.27, + "learning_rate": 3.863060016906171e-05, + "loss": 1.1229, + "step": 2690, + "task_loss": 1.7066051959991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.062046766281128, + "epoch": 2.27, + "learning_rate": 3.862637362637363e-05, + "loss": 0.9872, + "step": 2691, + "task_loss": 1.264611840248108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.625060498714447, + "epoch": 2.28, + "learning_rate": 3.862214708368555e-05, + "loss": 0.7797, + "step": 2692, + "task_loss": 0.7085673809051514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.76900315284729, + "epoch": 2.28, + "learning_rate": 3.8617920540997466e-05, + "loss": 1.014, + "step": 2693, + "task_loss": 0.7663934826850891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8141089677810669, + "epoch": 2.28, + "learning_rate": 3.8613693998309386e-05, + "loss": 0.8539, + "step": 2694, + "task_loss": 1.802156925201416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8502638339996338, + "epoch": 2.28, + "learning_rate": 3.86094674556213e-05, + "loss": 0.892, + "step": 2695, + "task_loss": 1.2902977466583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4483528733253479, + "epoch": 2.28, + "learning_rate": 3.860524091293322e-05, + "loss": 0.7045, + "step": 2696, + "task_loss": 0.539362370967865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8091086149215698, + "epoch": 2.28, + "learning_rate": 3.8601014370245145e-05, + "loss": 0.9149, + "step": 2697, + "task_loss": 0.44426122307777405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6857091188430786, + "epoch": 2.28, + "learning_rate": 3.8596787827557065e-05, + "loss": 0.9683, + "step": 2698, + "task_loss": 1.0794121026992798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.332659125328064, + "epoch": 2.28, + "learning_rate": 3.859256128486898e-05, + "loss": 1.1469, + "step": 2699, + "task_loss": 1.8514037132263184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8159573078155518, + "epoch": 2.28, + "learning_rate": 3.85883347421809e-05, + "loss": 0.9481, + "step": 2700, + "task_loss": 0.4771418571472168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7059981822967529, + "epoch": 2.28, + "learning_rate": 3.858410819949282e-05, + "loss": 0.7462, + "step": 2701, + "task_loss": 1.3292932510375977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7080508470535278, + "epoch": 2.28, + "learning_rate": 3.857988165680473e-05, + "loss": 0.7322, + "step": 2702, + "task_loss": 0.42925596237182617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8194615840911865, + "epoch": 2.28, + "learning_rate": 3.857565511411666e-05, + "loss": 0.7444, + "step": 2703, + "task_loss": 0.7658073902130127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3528589010238647, + "epoch": 2.29, + "learning_rate": 3.857142857142858e-05, + "loss": 1.0941, + "step": 2704, + "task_loss": 1.7712552547454834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8303042054176331, + "epoch": 2.29, + "learning_rate": 3.856720202874049e-05, + "loss": 1.0835, + "step": 2705, + "task_loss": 1.4435240030288696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7866034507751465, + "epoch": 2.29, + "learning_rate": 3.856297548605241e-05, + "loss": 0.8697, + "step": 2706, + "task_loss": 1.0457793474197388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9597992897033691, + "epoch": 2.29, + "learning_rate": 3.855874894336433e-05, + "loss": 1.1014, + "step": 2707, + "task_loss": 0.6245673894882202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8721973299980164, + "epoch": 2.29, + "learning_rate": 3.855452240067625e-05, + "loss": 0.7514, + "step": 2708, + "task_loss": 1.620715856552124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7276350259780884, + "epoch": 2.29, + "learning_rate": 3.855029585798817e-05, + "loss": 0.9331, + "step": 2709, + "task_loss": 1.8006185293197632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46855589747428894, + "epoch": 2.29, + "learning_rate": 3.854606931530009e-05, + "loss": 0.533, + "step": 2710, + "task_loss": 0.711432158946991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49448275566101074, + "epoch": 2.29, + "learning_rate": 3.854184277261201e-05, + "loss": 1.0711, + "step": 2711, + "task_loss": 0.33237916231155396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9341626167297363, + "epoch": 2.29, + "learning_rate": 3.853761622992392e-05, + "loss": 0.9384, + "step": 2712, + "task_loss": 0.8614869117736816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.186208724975586, + "epoch": 2.29, + "learning_rate": 3.853338968723584e-05, + "loss": 0.8829, + "step": 2713, + "task_loss": 0.6522613167762756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0377975702285767, + "epoch": 2.29, + "learning_rate": 3.852916314454777e-05, + "loss": 0.9292, + "step": 2714, + "task_loss": 0.5374699234962463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9838881492614746, + "epoch": 2.29, + "learning_rate": 3.852493660185968e-05, + "loss": 0.9744, + "step": 2715, + "task_loss": 1.5493029356002808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4743983745574951, + "epoch": 2.3, + "learning_rate": 3.85207100591716e-05, + "loss": 1.1343, + "step": 2716, + "task_loss": 2.1171298027038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7049221992492676, + "epoch": 2.3, + "learning_rate": 3.851648351648352e-05, + "loss": 0.7853, + "step": 2717, + "task_loss": 0.9822770953178406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7248736619949341, + "epoch": 2.3, + "learning_rate": 3.851225697379543e-05, + "loss": 0.756, + "step": 2718, + "task_loss": 1.1020175218582153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2542095184326172, + "epoch": 2.3, + "learning_rate": 3.850803043110735e-05, + "loss": 1.1003, + "step": 2719, + "task_loss": 1.3991535902023315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6531649231910706, + "epoch": 2.3, + "learning_rate": 3.850380388841928e-05, + "loss": 0.9238, + "step": 2720, + "task_loss": 1.1276453733444214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7287998199462891, + "epoch": 2.3, + "learning_rate": 3.849957734573119e-05, + "loss": 0.7562, + "step": 2721, + "task_loss": 0.3502250015735626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1081236600875854, + "epoch": 2.3, + "learning_rate": 3.849535080304311e-05, + "loss": 0.7874, + "step": 2722, + "task_loss": 0.49908962845802307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6261146068572998, + "epoch": 2.3, + "learning_rate": 3.849112426035503e-05, + "loss": 0.8146, + "step": 2723, + "task_loss": 0.728874146938324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9258499145507812, + "epoch": 2.3, + "learning_rate": 3.8486897717666944e-05, + "loss": 0.885, + "step": 2724, + "task_loss": 0.6445170044898987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0069427490234375, + "epoch": 2.3, + "learning_rate": 3.848267117497887e-05, + "loss": 1.0715, + "step": 2725, + "task_loss": 0.8258471488952637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7842346429824829, + "epoch": 2.3, + "learning_rate": 3.847844463229079e-05, + "loss": 1.0863, + "step": 2726, + "task_loss": 0.7736535668373108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7150890827178955, + "epoch": 2.3, + "learning_rate": 3.847421808960271e-05, + "loss": 0.901, + "step": 2727, + "task_loss": 0.699732780456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8901740312576294, + "epoch": 2.31, + "learning_rate": 3.846999154691462e-05, + "loss": 1.0364, + "step": 2728, + "task_loss": 0.9829272031784058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0776212215423584, + "epoch": 2.31, + "learning_rate": 3.846576500422654e-05, + "loss": 1.0861, + "step": 2729, + "task_loss": 2.727968692779541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2469274997711182, + "epoch": 2.31, + "learning_rate": 3.846153846153846e-05, + "loss": 0.9628, + "step": 2730, + "task_loss": 0.5393826961517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4563027322292328, + "epoch": 2.31, + "learning_rate": 3.845731191885038e-05, + "loss": 0.7541, + "step": 2731, + "task_loss": 0.3857778310775757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0147790908813477, + "epoch": 2.31, + "learning_rate": 3.84530853761623e-05, + "loss": 1.0398, + "step": 2732, + "task_loss": 1.0130153894424438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9169516563415527, + "epoch": 2.31, + "learning_rate": 3.844885883347422e-05, + "loss": 0.9369, + "step": 2733, + "task_loss": 1.0595557689666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5466210842132568, + "epoch": 2.31, + "learning_rate": 3.8444632290786135e-05, + "loss": 0.6902, + "step": 2734, + "task_loss": 0.8124677538871765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0851749181747437, + "epoch": 2.31, + "learning_rate": 3.8440405748098055e-05, + "loss": 1.0223, + "step": 2735, + "task_loss": 0.4945577383041382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9139431715011597, + "epoch": 2.31, + "learning_rate": 3.8436179205409975e-05, + "loss": 0.8949, + "step": 2736, + "task_loss": 0.329665869474411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6739212274551392, + "epoch": 2.31, + "learning_rate": 3.8431952662721894e-05, + "loss": 0.786, + "step": 2737, + "task_loss": 1.281335473060608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4547343254089355, + "epoch": 2.31, + "learning_rate": 3.8427726120033814e-05, + "loss": 1.0999, + "step": 2738, + "task_loss": 1.8744821548461914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6891152858734131, + "epoch": 2.32, + "learning_rate": 3.8423499577345734e-05, + "loss": 0.7368, + "step": 2739, + "task_loss": 0.7544235587120056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8011736869812012, + "epoch": 2.32, + "learning_rate": 3.8419273034657653e-05, + "loss": 0.9129, + "step": 2740, + "task_loss": 1.2579606771469116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9575057625770569, + "epoch": 2.32, + "learning_rate": 3.8415046491969566e-05, + "loss": 1.1874, + "step": 2741, + "task_loss": 0.6582731604576111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8653138875961304, + "epoch": 2.32, + "learning_rate": 3.841081994928149e-05, + "loss": 0.9631, + "step": 2742, + "task_loss": 0.9822165966033936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7020207047462463, + "epoch": 2.32, + "learning_rate": 3.840659340659341e-05, + "loss": 0.6931, + "step": 2743, + "task_loss": 2.2094030380249023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1200835704803467, + "epoch": 2.32, + "learning_rate": 3.8402366863905326e-05, + "loss": 0.8202, + "step": 2744, + "task_loss": 0.502842128276825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3473713397979736, + "epoch": 2.32, + "learning_rate": 3.8398140321217245e-05, + "loss": 0.8812, + "step": 2745, + "task_loss": 0.5459030866622925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9201637506484985, + "epoch": 2.32, + "learning_rate": 3.8393913778529165e-05, + "loss": 0.8083, + "step": 2746, + "task_loss": 0.5270050764083862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7367267608642578, + "epoch": 2.32, + "learning_rate": 3.8389687235841085e-05, + "loss": 0.8392, + "step": 2747, + "task_loss": 1.0417191982269287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.724015474319458, + "epoch": 2.32, + "learning_rate": 3.8385460693153005e-05, + "loss": 0.9245, + "step": 2748, + "task_loss": 0.8689031004905701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9509819746017456, + "epoch": 2.32, + "learning_rate": 3.8381234150464924e-05, + "loss": 0.8218, + "step": 2749, + "task_loss": 1.7260948419570923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6981604695320129, + "epoch": 2.32, + "learning_rate": 3.837700760777684e-05, + "loss": 0.8546, + "step": 2750, + "task_loss": 0.4585670232772827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8247796297073364, + "epoch": 2.33, + "learning_rate": 3.837278106508876e-05, + "loss": 1.0048, + "step": 2751, + "task_loss": 1.0435298681259155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9197900891304016, + "epoch": 2.33, + "learning_rate": 3.836855452240068e-05, + "loss": 1.1241, + "step": 2752, + "task_loss": 0.8211560249328613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4762333631515503, + "epoch": 2.33, + "learning_rate": 3.8364327979712597e-05, + "loss": 0.6653, + "step": 2753, + "task_loss": 0.5741895437240601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.156245231628418, + "epoch": 2.33, + "learning_rate": 3.8360101437024516e-05, + "loss": 1.103, + "step": 2754, + "task_loss": 1.2082412242889404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3442273139953613, + "epoch": 2.33, + "learning_rate": 3.8355874894336436e-05, + "loss": 1.1644, + "step": 2755, + "task_loss": 0.9181085228919983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8421118259429932, + "epoch": 2.33, + "learning_rate": 3.8351648351648356e-05, + "loss": 0.8537, + "step": 2756, + "task_loss": 0.7524453997612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0632362365722656, + "epoch": 2.33, + "learning_rate": 3.834742180896027e-05, + "loss": 1.0169, + "step": 2757, + "task_loss": 1.1791553497314453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8661927580833435, + "epoch": 2.33, + "learning_rate": 3.834319526627219e-05, + "loss": 0.8457, + "step": 2758, + "task_loss": 0.5272379517555237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7913994789123535, + "epoch": 2.33, + "learning_rate": 3.8338968723584115e-05, + "loss": 1.1334, + "step": 2759, + "task_loss": 2.080298900604248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7201851010322571, + "epoch": 2.33, + "learning_rate": 3.833474218089603e-05, + "loss": 0.8741, + "step": 2760, + "task_loss": 0.9891384243965149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7822756767272949, + "epoch": 2.33, + "learning_rate": 3.833051563820795e-05, + "loss": 0.8047, + "step": 2761, + "task_loss": 0.6375095248222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.438657283782959, + "epoch": 2.33, + "learning_rate": 3.832628909551987e-05, + "loss": 1.015, + "step": 2762, + "task_loss": 1.7634660005569458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1297513246536255, + "epoch": 2.34, + "learning_rate": 3.832206255283178e-05, + "loss": 0.8486, + "step": 2763, + "task_loss": 1.720613718032837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0095951557159424, + "epoch": 2.34, + "learning_rate": 3.831783601014371e-05, + "loss": 0.7871, + "step": 2764, + "task_loss": 1.1953233480453491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0161008834838867, + "epoch": 2.34, + "learning_rate": 3.8313609467455627e-05, + "loss": 0.7758, + "step": 2765, + "task_loss": 0.6029589176177979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6541587114334106, + "epoch": 2.34, + "learning_rate": 3.830938292476754e-05, + "loss": 0.7323, + "step": 2766, + "task_loss": 1.05960214138031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5586663484573364, + "epoch": 2.34, + "learning_rate": 3.830515638207946e-05, + "loss": 0.9495, + "step": 2767, + "task_loss": 0.4589826166629791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9883490800857544, + "epoch": 2.34, + "learning_rate": 3.830092983939138e-05, + "loss": 1.034, + "step": 2768, + "task_loss": 0.4439418613910675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7511551380157471, + "epoch": 2.34, + "learning_rate": 3.82967032967033e-05, + "loss": 0.7189, + "step": 2769, + "task_loss": 0.3807438015937805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1827082633972168, + "epoch": 2.34, + "learning_rate": 3.829247675401522e-05, + "loss": 0.8532, + "step": 2770, + "task_loss": 1.0165820121765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3562498092651367, + "epoch": 2.34, + "learning_rate": 3.828825021132714e-05, + "loss": 1.0029, + "step": 2771, + "task_loss": 0.637798547744751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8920499086380005, + "epoch": 2.34, + "learning_rate": 3.828402366863906e-05, + "loss": 0.9119, + "step": 2772, + "task_loss": 1.7400342226028442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4420353174209595, + "epoch": 2.34, + "learning_rate": 3.827979712595097e-05, + "loss": 0.9025, + "step": 2773, + "task_loss": 0.5119230151176453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8004299402236938, + "epoch": 2.34, + "learning_rate": 3.827557058326289e-05, + "loss": 0.7786, + "step": 2774, + "task_loss": 0.563220739364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0306073427200317, + "epoch": 2.35, + "learning_rate": 3.827134404057481e-05, + "loss": 0.9229, + "step": 2775, + "task_loss": 0.6508654356002808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7608354091644287, + "epoch": 2.35, + "learning_rate": 3.826711749788673e-05, + "loss": 0.8958, + "step": 2776, + "task_loss": 0.968728244304657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7283639311790466, + "epoch": 2.35, + "learning_rate": 3.826289095519865e-05, + "loss": 0.734, + "step": 2777, + "task_loss": 0.6921197772026062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5912861824035645, + "epoch": 2.35, + "learning_rate": 3.825866441251057e-05, + "loss": 0.7005, + "step": 2778, + "task_loss": 0.24096111953258514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6926796436309814, + "epoch": 2.35, + "learning_rate": 3.825443786982248e-05, + "loss": 0.9106, + "step": 2779, + "task_loss": 1.185141921043396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0011357069015503, + "epoch": 2.35, + "learning_rate": 3.82502113271344e-05, + "loss": 1.1086, + "step": 2780, + "task_loss": 0.8280209898948669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8165407180786133, + "epoch": 2.35, + "learning_rate": 3.824598478444633e-05, + "loss": 0.9512, + "step": 2781, + "task_loss": 1.393274188041687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2185384035110474, + "epoch": 2.35, + "learning_rate": 3.824175824175824e-05, + "loss": 0.9892, + "step": 2782, + "task_loss": 1.0361679792404175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8662289977073669, + "epoch": 2.35, + "learning_rate": 3.823753169907016e-05, + "loss": 0.9756, + "step": 2783, + "task_loss": 1.1324551105499268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7349108457565308, + "epoch": 2.35, + "learning_rate": 3.823330515638208e-05, + "loss": 0.5927, + "step": 2784, + "task_loss": 0.7382093071937561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0079318284988403, + "epoch": 2.35, + "learning_rate": 3.8229078613694e-05, + "loss": 1.2044, + "step": 2785, + "task_loss": 0.6155231595039368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0645049810409546, + "epoch": 2.35, + "learning_rate": 3.822485207100592e-05, + "loss": 1.2562, + "step": 2786, + "task_loss": 1.8873682022094727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.124566674232483, + "epoch": 2.36, + "learning_rate": 3.822062552831784e-05, + "loss": 1.1087, + "step": 2787, + "task_loss": 1.3697583675384521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1162587404251099, + "epoch": 2.36, + "learning_rate": 3.821639898562976e-05, + "loss": 0.9679, + "step": 2788, + "task_loss": 1.2720139026641846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8952822685241699, + "epoch": 2.36, + "learning_rate": 3.821217244294167e-05, + "loss": 1.0339, + "step": 2789, + "task_loss": 0.8598231673240662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.898161768913269, + "epoch": 2.36, + "learning_rate": 3.820794590025359e-05, + "loss": 1.0083, + "step": 2790, + "task_loss": 0.5518762469291687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8672978281974792, + "epoch": 2.36, + "learning_rate": 3.820371935756551e-05, + "loss": 0.8177, + "step": 2791, + "task_loss": 0.7514976263046265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1426260471343994, + "epoch": 2.36, + "learning_rate": 3.819949281487743e-05, + "loss": 1.025, + "step": 2792, + "task_loss": 0.9490423798561096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2920037508010864, + "epoch": 2.36, + "learning_rate": 3.819526627218935e-05, + "loss": 0.8818, + "step": 2793, + "task_loss": 1.3418645858764648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.809194803237915, + "epoch": 2.36, + "learning_rate": 3.819103972950127e-05, + "loss": 1.1807, + "step": 2794, + "task_loss": 0.8744732141494751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4683275818824768, + "epoch": 2.36, + "learning_rate": 3.8186813186813185e-05, + "loss": 0.9883, + "step": 2795, + "task_loss": 0.46207037568092346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8712544441223145, + "epoch": 2.36, + "learning_rate": 3.8182586644125105e-05, + "loss": 1.1729, + "step": 2796, + "task_loss": 1.0861806869506836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8199642896652222, + "epoch": 2.36, + "learning_rate": 3.8178360101437024e-05, + "loss": 0.8701, + "step": 2797, + "task_loss": 0.8214704990386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7675526738166809, + "epoch": 2.36, + "learning_rate": 3.817413355874895e-05, + "loss": 0.7958, + "step": 2798, + "task_loss": 0.3009050488471985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9251284599304199, + "epoch": 2.37, + "learning_rate": 3.8169907016060864e-05, + "loss": 0.9854, + "step": 2799, + "task_loss": 1.2297422885894775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3322408199310303, + "epoch": 2.37, + "learning_rate": 3.8165680473372784e-05, + "loss": 1.1804, + "step": 2800, + "task_loss": 0.9608936905860901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8078939914703369, + "epoch": 2.37, + "learning_rate": 3.81614539306847e-05, + "loss": 0.8534, + "step": 2801, + "task_loss": 0.6127941012382507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4568567276000977, + "epoch": 2.37, + "learning_rate": 3.8157227387996616e-05, + "loss": 0.9776, + "step": 2802, + "task_loss": 1.3818163871765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7733312249183655, + "epoch": 2.37, + "learning_rate": 3.815300084530854e-05, + "loss": 0.8674, + "step": 2803, + "task_loss": 0.9998605251312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9046659469604492, + "epoch": 2.37, + "learning_rate": 3.814877430262046e-05, + "loss": 0.9599, + "step": 2804, + "task_loss": 0.7956311702728271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0796973705291748, + "epoch": 2.37, + "learning_rate": 3.8144547759932375e-05, + "loss": 0.9899, + "step": 2805, + "task_loss": 1.750176191329956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.581706166267395, + "epoch": 2.37, + "learning_rate": 3.8140321217244295e-05, + "loss": 0.9691, + "step": 2806, + "task_loss": 0.6713460683822632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9355568289756775, + "epoch": 2.37, + "learning_rate": 3.8136094674556215e-05, + "loss": 0.983, + "step": 2807, + "task_loss": 0.8749027848243713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1323107481002808, + "epoch": 2.37, + "learning_rate": 3.8131868131868135e-05, + "loss": 0.8204, + "step": 2808, + "task_loss": 1.021223783493042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9036589860916138, + "epoch": 2.37, + "learning_rate": 3.8127641589180054e-05, + "loss": 0.9869, + "step": 2809, + "task_loss": 0.6791079640388489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9798613786697388, + "epoch": 2.38, + "learning_rate": 3.8123415046491974e-05, + "loss": 0.9178, + "step": 2810, + "task_loss": 1.2401801347732544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9742584824562073, + "epoch": 2.38, + "learning_rate": 3.811918850380389e-05, + "loss": 1.1287, + "step": 2811, + "task_loss": 0.8988955616950989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2078886032104492, + "epoch": 2.38, + "learning_rate": 3.811496196111581e-05, + "loss": 0.9508, + "step": 2812, + "task_loss": 0.9693688154220581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.503617525100708, + "epoch": 2.38, + "learning_rate": 3.811073541842773e-05, + "loss": 0.9074, + "step": 2813, + "task_loss": 1.4440945386886597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5045994520187378, + "epoch": 2.38, + "learning_rate": 3.8106508875739646e-05, + "loss": 0.8363, + "step": 2814, + "task_loss": 0.3252667188644409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7501572370529175, + "epoch": 2.38, + "learning_rate": 3.8102282333051566e-05, + "loss": 0.9259, + "step": 2815, + "task_loss": 0.5465265512466431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5216963291168213, + "epoch": 2.38, + "learning_rate": 3.8098055790363486e-05, + "loss": 1.1598, + "step": 2816, + "task_loss": 1.1024634838104248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1000169515609741, + "epoch": 2.38, + "learning_rate": 3.8093829247675406e-05, + "loss": 0.7246, + "step": 2817, + "task_loss": 0.9394451379776001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5417129993438721, + "epoch": 2.38, + "learning_rate": 3.808960270498732e-05, + "loss": 0.7452, + "step": 2818, + "task_loss": 0.20770999789237976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5670961737632751, + "epoch": 2.38, + "learning_rate": 3.808537616229924e-05, + "loss": 0.7114, + "step": 2819, + "task_loss": 0.7518359422683716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8874340057373047, + "epoch": 2.38, + "learning_rate": 3.8081149619611165e-05, + "loss": 0.7689, + "step": 2820, + "task_loss": 1.8740922212600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8931160569190979, + "epoch": 2.38, + "learning_rate": 3.807692307692308e-05, + "loss": 0.8219, + "step": 2821, + "task_loss": 0.38368308544158936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.794457733631134, + "epoch": 2.39, + "learning_rate": 3.8072696534235e-05, + "loss": 0.9938, + "step": 2822, + "task_loss": 0.8908917903900146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8437801599502563, + "epoch": 2.39, + "learning_rate": 3.806846999154692e-05, + "loss": 1.1097, + "step": 2823, + "task_loss": 1.163989543914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8442609310150146, + "epoch": 2.39, + "learning_rate": 3.806424344885883e-05, + "loss": 0.7882, + "step": 2824, + "task_loss": 0.5577024817466736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7913405895233154, + "epoch": 2.39, + "learning_rate": 3.806001690617076e-05, + "loss": 1.2467, + "step": 2825, + "task_loss": 0.7866659164428711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.442245602607727, + "epoch": 2.39, + "learning_rate": 3.8055790363482676e-05, + "loss": 0.8563, + "step": 2826, + "task_loss": 1.3980008363723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2754932641983032, + "epoch": 2.39, + "learning_rate": 3.8051563820794596e-05, + "loss": 0.9085, + "step": 2827, + "task_loss": 0.8194046020507812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6932305097579956, + "epoch": 2.39, + "learning_rate": 3.804733727810651e-05, + "loss": 0.7479, + "step": 2828, + "task_loss": 0.5999937653541565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.947229266166687, + "epoch": 2.39, + "learning_rate": 3.804311073541843e-05, + "loss": 0.9813, + "step": 2829, + "task_loss": 1.659379243850708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2015345096588135, + "epoch": 2.39, + "learning_rate": 3.803888419273035e-05, + "loss": 1.0021, + "step": 2830, + "task_loss": 0.7031340003013611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0736157894134521, + "epoch": 2.39, + "learning_rate": 3.803465765004227e-05, + "loss": 0.8181, + "step": 2831, + "task_loss": 0.5764914155006409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9041352272033691, + "epoch": 2.39, + "learning_rate": 3.803043110735419e-05, + "loss": 0.9886, + "step": 2832, + "task_loss": 0.5194782018661499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1091835498809814, + "epoch": 2.39, + "learning_rate": 3.802620456466611e-05, + "loss": 0.7039, + "step": 2833, + "task_loss": 1.4340245723724365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7108336687088013, + "epoch": 2.4, + "learning_rate": 3.802197802197802e-05, + "loss": 0.7947, + "step": 2834, + "task_loss": 0.9190305471420288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.643272876739502, + "epoch": 2.4, + "learning_rate": 3.801775147928994e-05, + "loss": 0.8723, + "step": 2835, + "task_loss": 0.7608177661895752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1798882484436035, + "epoch": 2.4, + "learning_rate": 3.801352493660186e-05, + "loss": 0.8782, + "step": 2836, + "task_loss": 1.4496816396713257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8791857361793518, + "epoch": 2.4, + "learning_rate": 3.800929839391378e-05, + "loss": 0.816, + "step": 2837, + "task_loss": 0.7793219685554504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.612616777420044, + "epoch": 2.4, + "learning_rate": 3.80050718512257e-05, + "loss": 1.0627, + "step": 2838, + "task_loss": 0.5208151936531067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6477140784263611, + "epoch": 2.4, + "learning_rate": 3.800084530853762e-05, + "loss": 0.7009, + "step": 2839, + "task_loss": 0.7624218463897705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9690272808074951, + "epoch": 2.4, + "learning_rate": 3.799661876584953e-05, + "loss": 1.0545, + "step": 2840, + "task_loss": 1.3269321918487549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6555296182632446, + "epoch": 2.4, + "learning_rate": 3.799239222316145e-05, + "loss": 0.8966, + "step": 2841, + "task_loss": 1.1732136011123657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1967496871948242, + "epoch": 2.4, + "learning_rate": 3.798816568047338e-05, + "loss": 1.0562, + "step": 2842, + "task_loss": 0.7597159147262573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6872501373291016, + "epoch": 2.4, + "learning_rate": 3.79839391377853e-05, + "loss": 0.9537, + "step": 2843, + "task_loss": 0.5742092728614807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.521113932132721, + "epoch": 2.4, + "learning_rate": 3.797971259509721e-05, + "loss": 0.8332, + "step": 2844, + "task_loss": 0.5091487169265747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8770514726638794, + "epoch": 2.4, + "learning_rate": 3.797548605240913e-05, + "loss": 0.8272, + "step": 2845, + "task_loss": 0.4415215253829956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37679287791252136, + "epoch": 2.41, + "learning_rate": 3.797125950972105e-05, + "loss": 0.8942, + "step": 2846, + "task_loss": 0.3382928669452667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6787087917327881, + "epoch": 2.41, + "learning_rate": 3.7967032967032964e-05, + "loss": 0.8057, + "step": 2847, + "task_loss": 0.8066645264625549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4253851175308228, + "epoch": 2.41, + "learning_rate": 3.796280642434489e-05, + "loss": 1.0801, + "step": 2848, + "task_loss": 0.9736409187316895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1561572551727295, + "epoch": 2.41, + "learning_rate": 3.795857988165681e-05, + "loss": 1.1471, + "step": 2849, + "task_loss": 0.3738325834274292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8552138805389404, + "epoch": 2.41, + "learning_rate": 3.795435333896872e-05, + "loss": 0.8915, + "step": 2850, + "task_loss": 0.5768214464187622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6983251571655273, + "epoch": 2.41, + "learning_rate": 3.795012679628064e-05, + "loss": 0.6199, + "step": 2851, + "task_loss": 0.4596404731273651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7657592296600342, + "epoch": 2.41, + "learning_rate": 3.794590025359256e-05, + "loss": 0.8677, + "step": 2852, + "task_loss": 1.4275768995285034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8488539457321167, + "epoch": 2.41, + "learning_rate": 3.794167371090448e-05, + "loss": 0.8849, + "step": 2853, + "task_loss": 0.08751793950796127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8718726634979248, + "epoch": 2.41, + "learning_rate": 3.79374471682164e-05, + "loss": 0.8725, + "step": 2854, + "task_loss": 0.6644324660301208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0168962478637695, + "epoch": 2.41, + "learning_rate": 3.793322062552832e-05, + "loss": 0.8824, + "step": 2855, + "task_loss": 1.6163610219955444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5451105237007141, + "epoch": 2.41, + "learning_rate": 3.792899408284024e-05, + "loss": 0.8378, + "step": 2856, + "task_loss": 1.109480857849121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6393899917602539, + "epoch": 2.41, + "learning_rate": 3.7924767540152154e-05, + "loss": 0.7904, + "step": 2857, + "task_loss": 1.615539789199829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9568266868591309, + "epoch": 2.42, + "learning_rate": 3.7920540997464074e-05, + "loss": 0.8148, + "step": 2858, + "task_loss": 1.2032114267349243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9986792802810669, + "epoch": 2.42, + "learning_rate": 3.7916314454776e-05, + "loss": 0.8691, + "step": 2859, + "task_loss": 0.6073633432388306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46091777086257935, + "epoch": 2.42, + "learning_rate": 3.7912087912087914e-05, + "loss": 0.7717, + "step": 2860, + "task_loss": 0.6288771033287048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7875893115997314, + "epoch": 2.42, + "learning_rate": 3.7907861369399833e-05, + "loss": 0.7434, + "step": 2861, + "task_loss": 1.2561161518096924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7107213139533997, + "epoch": 2.42, + "learning_rate": 3.790363482671175e-05, + "loss": 0.8734, + "step": 2862, + "task_loss": 0.5317698121070862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7418437004089355, + "epoch": 2.42, + "learning_rate": 3.7899408284023666e-05, + "loss": 1.0113, + "step": 2863, + "task_loss": 1.2537049055099487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.535418689250946, + "epoch": 2.42, + "learning_rate": 3.7895181741335586e-05, + "loss": 0.531, + "step": 2864, + "task_loss": 0.5316053032875061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7312465906143188, + "epoch": 2.42, + "learning_rate": 3.789095519864751e-05, + "loss": 0.8328, + "step": 2865, + "task_loss": 0.1997838169336319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3078303337097168, + "epoch": 2.42, + "learning_rate": 3.7886728655959425e-05, + "loss": 1.1117, + "step": 2866, + "task_loss": 1.3781254291534424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47803449630737305, + "epoch": 2.42, + "learning_rate": 3.7882502113271345e-05, + "loss": 0.8111, + "step": 2867, + "task_loss": 0.13380055129528046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8989896774291992, + "epoch": 2.42, + "learning_rate": 3.7878275570583265e-05, + "loss": 1.049, + "step": 2868, + "task_loss": 0.9677106142044067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5937565565109253, + "epoch": 2.42, + "learning_rate": 3.787404902789518e-05, + "loss": 0.9118, + "step": 2869, + "task_loss": 0.35147061944007874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5706838369369507, + "epoch": 2.43, + "learning_rate": 3.7869822485207104e-05, + "loss": 1.0731, + "step": 2870, + "task_loss": 0.7251583337783813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9305149912834167, + "epoch": 2.43, + "learning_rate": 3.7865595942519024e-05, + "loss": 0.9065, + "step": 2871, + "task_loss": 1.2101788520812988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7045131921768188, + "epoch": 2.43, + "learning_rate": 3.7861369399830944e-05, + "loss": 1.0504, + "step": 2872, + "task_loss": 1.1862872838974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44650229811668396, + "epoch": 2.43, + "learning_rate": 3.785714285714286e-05, + "loss": 0.5797, + "step": 2873, + "task_loss": 0.6658046245574951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4747769832611084, + "epoch": 2.43, + "learning_rate": 3.7852916314454776e-05, + "loss": 0.8405, + "step": 2874, + "task_loss": 0.29629307985305786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0143952369689941, + "epoch": 2.43, + "learning_rate": 3.7848689771766696e-05, + "loss": 1.1113, + "step": 2875, + "task_loss": 2.073786735534668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6739494204521179, + "epoch": 2.43, + "learning_rate": 3.7844463229078616e-05, + "loss": 0.9615, + "step": 2876, + "task_loss": 0.9683629870414734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.92393958568573, + "epoch": 2.43, + "learning_rate": 3.7840236686390536e-05, + "loss": 0.8059, + "step": 2877, + "task_loss": 0.9110194444656372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9442762136459351, + "epoch": 2.43, + "learning_rate": 3.7836010143702455e-05, + "loss": 1.1131, + "step": 2878, + "task_loss": 1.1371574401855469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0322346687316895, + "epoch": 2.43, + "learning_rate": 3.783178360101437e-05, + "loss": 0.8629, + "step": 2879, + "task_loss": 0.4763219356536865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.060889720916748, + "epoch": 2.43, + "learning_rate": 3.782755705832629e-05, + "loss": 0.8096, + "step": 2880, + "task_loss": 0.9987779259681702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4273499250411987, + "epoch": 2.44, + "learning_rate": 3.782333051563821e-05, + "loss": 1.0835, + "step": 2881, + "task_loss": 0.9675452709197998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.128800392150879, + "epoch": 2.44, + "learning_rate": 3.781910397295013e-05, + "loss": 0.9938, + "step": 2882, + "task_loss": 0.7573325037956238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6825935244560242, + "epoch": 2.44, + "learning_rate": 3.781487743026205e-05, + "loss": 0.6113, + "step": 2883, + "task_loss": 0.5623762607574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5015822649002075, + "epoch": 2.44, + "learning_rate": 3.781065088757397e-05, + "loss": 0.7364, + "step": 2884, + "task_loss": 0.18396219611167908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1744149923324585, + "epoch": 2.44, + "learning_rate": 3.780642434488589e-05, + "loss": 0.869, + "step": 2885, + "task_loss": 1.084243655204773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7606294751167297, + "epoch": 2.44, + "learning_rate": 3.78021978021978e-05, + "loss": 1.1993, + "step": 2886, + "task_loss": 0.7473613023757935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3170629143714905, + "epoch": 2.44, + "learning_rate": 3.7797971259509726e-05, + "loss": 0.7858, + "step": 2887, + "task_loss": 0.508811891078949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4943113327026367, + "epoch": 2.44, + "learning_rate": 3.7793744716821646e-05, + "loss": 0.7492, + "step": 2888, + "task_loss": 0.7180906534194946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7753883600234985, + "epoch": 2.44, + "learning_rate": 3.778951817413356e-05, + "loss": 0.9868, + "step": 2889, + "task_loss": 0.4151588976383209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7795724272727966, + "epoch": 2.44, + "learning_rate": 3.778529163144548e-05, + "loss": 0.8007, + "step": 2890, + "task_loss": 0.28128159046173096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6224783658981323, + "epoch": 2.44, + "learning_rate": 3.77810650887574e-05, + "loss": 0.5659, + "step": 2891, + "task_loss": 0.167165145277977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7167070508003235, + "epoch": 2.44, + "learning_rate": 3.777683854606932e-05, + "loss": 1.0862, + "step": 2892, + "task_loss": 0.43149879574775696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5128650665283203, + "epoch": 2.45, + "learning_rate": 3.777261200338124e-05, + "loss": 0.6924, + "step": 2893, + "task_loss": 0.4081245958805084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5909888744354248, + "epoch": 2.45, + "learning_rate": 3.776838546069316e-05, + "loss": 0.5409, + "step": 2894, + "task_loss": 0.878649115562439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.476996898651123, + "epoch": 2.45, + "learning_rate": 3.776415891800507e-05, + "loss": 0.9689, + "step": 2895, + "task_loss": 0.6604695320129395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5343919396400452, + "epoch": 2.45, + "learning_rate": 3.775993237531699e-05, + "loss": 0.7259, + "step": 2896, + "task_loss": 0.04899701476097107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4607908725738525, + "epoch": 2.45, + "learning_rate": 3.775570583262891e-05, + "loss": 1.0346, + "step": 2897, + "task_loss": 0.9924553632736206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7702344655990601, + "epoch": 2.45, + "learning_rate": 3.775147928994083e-05, + "loss": 0.9853, + "step": 2898, + "task_loss": 0.5657379031181335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9297490119934082, + "epoch": 2.45, + "learning_rate": 3.774725274725275e-05, + "loss": 0.7764, + "step": 2899, + "task_loss": 0.8503624796867371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6266500949859619, + "epoch": 2.45, + "learning_rate": 3.774302620456467e-05, + "loss": 0.9686, + "step": 2900, + "task_loss": 0.5785765051841736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.264108419418335, + "epoch": 2.45, + "learning_rate": 3.773879966187659e-05, + "loss": 0.9798, + "step": 2901, + "task_loss": 1.1112068891525269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6151860356330872, + "epoch": 2.45, + "learning_rate": 3.77345731191885e-05, + "loss": 0.7393, + "step": 2902, + "task_loss": 0.40300264954566956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8651176691055298, + "epoch": 2.45, + "learning_rate": 3.773034657650042e-05, + "loss": 0.7074, + "step": 2903, + "task_loss": 0.4849865436553955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7949943542480469, + "epoch": 2.45, + "learning_rate": 3.772612003381235e-05, + "loss": 0.9134, + "step": 2904, + "task_loss": 0.6825757622718811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.101493239402771, + "epoch": 2.46, + "learning_rate": 3.772189349112426e-05, + "loss": 0.8243, + "step": 2905, + "task_loss": 1.1920785903930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2680885791778564, + "epoch": 2.46, + "learning_rate": 3.771766694843618e-05, + "loss": 1.0018, + "step": 2906, + "task_loss": 1.1242799758911133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7173442244529724, + "epoch": 2.46, + "learning_rate": 3.77134404057481e-05, + "loss": 0.8033, + "step": 2907, + "task_loss": 1.3211387395858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6088575124740601, + "epoch": 2.46, + "learning_rate": 3.7709213863060014e-05, + "loss": 0.6901, + "step": 2908, + "task_loss": 0.5301983952522278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9267017841339111, + "epoch": 2.46, + "learning_rate": 3.770498732037194e-05, + "loss": 1.0039, + "step": 2909, + "task_loss": 1.302922248840332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8784389495849609, + "epoch": 2.46, + "learning_rate": 3.770076077768386e-05, + "loss": 0.9265, + "step": 2910, + "task_loss": 1.1465840339660645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5505959987640381, + "epoch": 2.46, + "learning_rate": 3.769653423499577e-05, + "loss": 0.7087, + "step": 2911, + "task_loss": 0.3177320957183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8098050355911255, + "epoch": 2.46, + "learning_rate": 3.769230769230769e-05, + "loss": 0.85, + "step": 2912, + "task_loss": 0.668305516242981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3092848062515259, + "epoch": 2.46, + "learning_rate": 3.768808114961961e-05, + "loss": 0.8966, + "step": 2913, + "task_loss": 0.5536526441574097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6809405088424683, + "epoch": 2.46, + "learning_rate": 3.768385460693153e-05, + "loss": 0.8047, + "step": 2914, + "task_loss": 0.7208324074745178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.644074022769928, + "epoch": 2.46, + "learning_rate": 3.767962806424345e-05, + "loss": 0.7074, + "step": 2915, + "task_loss": 0.5879451632499695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9256362915039062, + "epoch": 2.46, + "learning_rate": 3.767540152155537e-05, + "loss": 0.9838, + "step": 2916, + "task_loss": 1.6519067287445068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5502112507820129, + "epoch": 2.47, + "learning_rate": 3.767117497886729e-05, + "loss": 0.8988, + "step": 2917, + "task_loss": 0.35601359605789185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5163922905921936, + "epoch": 2.47, + "learning_rate": 3.7666948436179204e-05, + "loss": 0.7384, + "step": 2918, + "task_loss": 0.16870497167110443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6808202266693115, + "epoch": 2.47, + "learning_rate": 3.7662721893491124e-05, + "loss": 0.8741, + "step": 2919, + "task_loss": 0.1618366837501526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7896201610565186, + "epoch": 2.47, + "learning_rate": 3.7658495350803044e-05, + "loss": 0.8297, + "step": 2920, + "task_loss": 0.8454868793487549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5500733852386475, + "epoch": 2.47, + "learning_rate": 3.7654268808114964e-05, + "loss": 0.9446, + "step": 2921, + "task_loss": 0.7291027903556824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.682433009147644, + "epoch": 2.47, + "learning_rate": 3.765004226542688e-05, + "loss": 1.0764, + "step": 2922, + "task_loss": 1.1330301761627197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8980833292007446, + "epoch": 2.47, + "learning_rate": 3.76458157227388e-05, + "loss": 0.812, + "step": 2923, + "task_loss": 0.4154594838619232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6040636301040649, + "epoch": 2.47, + "learning_rate": 3.7641589180050716e-05, + "loss": 0.962, + "step": 2924, + "task_loss": 1.673122525215149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5187690854072571, + "epoch": 2.47, + "learning_rate": 3.7637362637362636e-05, + "loss": 0.7238, + "step": 2925, + "task_loss": 0.8250047564506531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8777328133583069, + "epoch": 2.47, + "learning_rate": 3.763313609467456e-05, + "loss": 0.9082, + "step": 2926, + "task_loss": 1.361264944076538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8328543901443481, + "epoch": 2.47, + "learning_rate": 3.7628909551986475e-05, + "loss": 0.7689, + "step": 2927, + "task_loss": 0.6862871050834656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40595555305480957, + "epoch": 2.47, + "learning_rate": 3.7624683009298395e-05, + "loss": 0.8602, + "step": 2928, + "task_loss": 0.661162257194519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4726664423942566, + "epoch": 2.48, + "learning_rate": 3.7620456466610315e-05, + "loss": 0.8362, + "step": 2929, + "task_loss": 0.3763027489185333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26753073930740356, + "epoch": 2.48, + "learning_rate": 3.7616229923922234e-05, + "loss": 0.5996, + "step": 2930, + "task_loss": 0.5454975962638855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6390641927719116, + "epoch": 2.48, + "learning_rate": 3.7612003381234154e-05, + "loss": 0.6612, + "step": 2931, + "task_loss": 0.19475023448467255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2549428939819336, + "epoch": 2.48, + "learning_rate": 3.7607776838546074e-05, + "loss": 1.0256, + "step": 2932, + "task_loss": 0.9723878502845764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0699330568313599, + "epoch": 2.48, + "learning_rate": 3.7603550295857994e-05, + "loss": 0.8021, + "step": 2933, + "task_loss": 0.6617919206619263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8121709227561951, + "epoch": 2.48, + "learning_rate": 3.7599323753169907e-05, + "loss": 0.9196, + "step": 2934, + "task_loss": 1.1484935283660889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5095109343528748, + "epoch": 2.48, + "learning_rate": 3.7595097210481826e-05, + "loss": 0.8336, + "step": 2935, + "task_loss": 0.039555761963129044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8502115607261658, + "epoch": 2.48, + "learning_rate": 3.7590870667793746e-05, + "loss": 0.7967, + "step": 2936, + "task_loss": 0.2476097047328949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1505329608917236, + "epoch": 2.48, + "learning_rate": 3.7586644125105666e-05, + "loss": 1.0698, + "step": 2937, + "task_loss": 0.5473749041557312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39872610569000244, + "epoch": 2.48, + "learning_rate": 3.7582417582417586e-05, + "loss": 0.8293, + "step": 2938, + "task_loss": 0.06977615505456924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.145108699798584, + "epoch": 2.48, + "learning_rate": 3.7578191039729505e-05, + "loss": 1.1745, + "step": 2939, + "task_loss": 1.9666355848312378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9140480756759644, + "epoch": 2.48, + "learning_rate": 3.757396449704142e-05, + "loss": 0.7633, + "step": 2940, + "task_loss": 0.8574177622795105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1026582717895508, + "epoch": 2.49, + "learning_rate": 3.756973795435334e-05, + "loss": 0.8203, + "step": 2941, + "task_loss": 1.4931082725524902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1289174556732178, + "epoch": 2.49, + "learning_rate": 3.756551141166526e-05, + "loss": 0.9108, + "step": 2942, + "task_loss": 1.2095997333526611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6390035152435303, + "epoch": 2.49, + "learning_rate": 3.7561284868977184e-05, + "loss": 1.1012, + "step": 2943, + "task_loss": 0.9534785747528076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6785563230514526, + "epoch": 2.49, + "learning_rate": 3.75570583262891e-05, + "loss": 0.7194, + "step": 2944, + "task_loss": 0.5434262752532959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2151813507080078, + "epoch": 2.49, + "learning_rate": 3.755283178360102e-05, + "loss": 0.849, + "step": 2945, + "task_loss": 0.9139522910118103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6218034029006958, + "epoch": 2.49, + "learning_rate": 3.754860524091294e-05, + "loss": 0.8019, + "step": 2946, + "task_loss": 1.1818902492523193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1467573642730713, + "epoch": 2.49, + "learning_rate": 3.754437869822485e-05, + "loss": 0.8421, + "step": 2947, + "task_loss": 1.4039177894592285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7867821455001831, + "epoch": 2.49, + "learning_rate": 3.7540152155536776e-05, + "loss": 0.8159, + "step": 2948, + "task_loss": 0.5303400754928589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9818558692932129, + "epoch": 2.49, + "learning_rate": 3.7535925612848696e-05, + "loss": 0.9777, + "step": 2949, + "task_loss": 0.9032993912696838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7279269099235535, + "epoch": 2.49, + "learning_rate": 3.753169907016061e-05, + "loss": 0.7193, + "step": 2950, + "task_loss": 0.6661729216575623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2358300685882568, + "epoch": 2.49, + "learning_rate": 3.752747252747253e-05, + "loss": 1.0239, + "step": 2951, + "task_loss": 0.7185297608375549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7788811922073364, + "epoch": 2.5, + "learning_rate": 3.752324598478445e-05, + "loss": 1.0897, + "step": 2952, + "task_loss": 1.0462582111358643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7787549495697021, + "epoch": 2.5, + "learning_rate": 3.751901944209637e-05, + "loss": 0.8269, + "step": 2953, + "task_loss": 0.3610954284667969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7916620969772339, + "epoch": 2.5, + "learning_rate": 3.751479289940829e-05, + "loss": 0.9498, + "step": 2954, + "task_loss": 0.3820704221725464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7843740582466125, + "epoch": 2.5, + "learning_rate": 3.751056635672021e-05, + "loss": 0.8804, + "step": 2955, + "task_loss": 1.3656545877456665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9761601686477661, + "epoch": 2.5, + "learning_rate": 3.750633981403212e-05, + "loss": 1.4578, + "step": 2956, + "task_loss": 0.8883195519447327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5347753763198853, + "epoch": 2.5, + "learning_rate": 3.750211327134404e-05, + "loss": 1.11, + "step": 2957, + "task_loss": 1.038547158241272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7983033657073975, + "epoch": 2.5, + "learning_rate": 3.749788672865596e-05, + "loss": 0.7831, + "step": 2958, + "task_loss": 0.8317264914512634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5666301250457764, + "epoch": 2.5, + "learning_rate": 3.749366018596788e-05, + "loss": 0.659, + "step": 2959, + "task_loss": 0.2887917160987854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5068851113319397, + "epoch": 2.5, + "learning_rate": 3.74894336432798e-05, + "loss": 0.8101, + "step": 2960, + "task_loss": 0.15603429079055786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9002972841262817, + "epoch": 2.5, + "learning_rate": 3.748520710059172e-05, + "loss": 0.8178, + "step": 2961, + "task_loss": 0.998735249042511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5958905220031738, + "epoch": 2.5, + "learning_rate": 3.748098055790364e-05, + "loss": 0.7454, + "step": 2962, + "task_loss": 1.4648840427398682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6624780893325806, + "epoch": 2.5, + "learning_rate": 3.747675401521555e-05, + "loss": 0.6671, + "step": 2963, + "task_loss": 0.7569023370742798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.987747073173523, + "epoch": 2.51, + "learning_rate": 3.747252747252747e-05, + "loss": 0.7604, + "step": 2964, + "task_loss": 0.9567808508872986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8434939384460449, + "epoch": 2.51, + "learning_rate": 3.74683009298394e-05, + "loss": 0.7836, + "step": 2965, + "task_loss": 0.4334939420223236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5708608627319336, + "epoch": 2.51, + "learning_rate": 3.746407438715131e-05, + "loss": 0.605, + "step": 2966, + "task_loss": 1.0584946870803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.516164243221283, + "epoch": 2.51, + "learning_rate": 3.745984784446323e-05, + "loss": 0.69, + "step": 2967, + "task_loss": 0.7146093845367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.048117756843567, + "epoch": 2.51, + "learning_rate": 3.745562130177515e-05, + "loss": 0.7239, + "step": 2968, + "task_loss": 0.7731064558029175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8396795988082886, + "epoch": 2.51, + "learning_rate": 3.7451394759087064e-05, + "loss": 0.8928, + "step": 2969, + "task_loss": 0.8537657260894775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0115752220153809, + "epoch": 2.51, + "learning_rate": 3.744716821639899e-05, + "loss": 1.0599, + "step": 2970, + "task_loss": 1.3865740299224854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35942721366882324, + "epoch": 2.51, + "learning_rate": 3.744294167371091e-05, + "loss": 0.8565, + "step": 2971, + "task_loss": 0.16102173924446106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6482678651809692, + "epoch": 2.51, + "learning_rate": 3.743871513102282e-05, + "loss": 0.8869, + "step": 2972, + "task_loss": 0.48032504320144653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7061709761619568, + "epoch": 2.51, + "learning_rate": 3.743448858833474e-05, + "loss": 0.8946, + "step": 2973, + "task_loss": 1.0742160081863403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1326513290405273, + "epoch": 2.51, + "learning_rate": 3.743026204564666e-05, + "loss": 0.8756, + "step": 2974, + "task_loss": 0.41850972175598145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0805472135543823, + "epoch": 2.51, + "learning_rate": 3.742603550295858e-05, + "loss": 0.9646, + "step": 2975, + "task_loss": 1.9752739667892456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.032296895980835, + "epoch": 2.52, + "learning_rate": 3.74218089602705e-05, + "loss": 0.8872, + "step": 2976, + "task_loss": 1.311806559562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4265739321708679, + "epoch": 2.52, + "learning_rate": 3.741758241758242e-05, + "loss": 0.8001, + "step": 2977, + "task_loss": 0.1314382553100586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9535980820655823, + "epoch": 2.52, + "learning_rate": 3.741335587489434e-05, + "loss": 0.7161, + "step": 2978, + "task_loss": 0.531485915184021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8080883026123047, + "epoch": 2.52, + "learning_rate": 3.7409129332206254e-05, + "loss": 1.0579, + "step": 2979, + "task_loss": 0.7735041379928589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4333162307739258, + "epoch": 2.52, + "learning_rate": 3.7404902789518174e-05, + "loss": 0.8919, + "step": 2980, + "task_loss": 1.221501350402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9503706097602844, + "epoch": 2.52, + "learning_rate": 3.7400676246830094e-05, + "loss": 1.0048, + "step": 2981, + "task_loss": 0.3086860179901123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.629895806312561, + "epoch": 2.52, + "learning_rate": 3.739644970414201e-05, + "loss": 0.7436, + "step": 2982, + "task_loss": 0.11467395722866058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7233576774597168, + "epoch": 2.52, + "learning_rate": 3.739222316145393e-05, + "loss": 0.7818, + "step": 2983, + "task_loss": 0.5050192475318909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.792982816696167, + "epoch": 2.52, + "learning_rate": 3.738799661876585e-05, + "loss": 0.974, + "step": 2984, + "task_loss": 0.8836943507194519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5706294775009155, + "epoch": 2.52, + "learning_rate": 3.7383770076077766e-05, + "loss": 0.6983, + "step": 2985, + "task_loss": 0.3576892614364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8437615036964417, + "epoch": 2.52, + "learning_rate": 3.7379543533389686e-05, + "loss": 0.7512, + "step": 2986, + "task_loss": 0.7124462723731995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6693239212036133, + "epoch": 2.52, + "learning_rate": 3.737531699070161e-05, + "loss": 0.8559, + "step": 2987, + "task_loss": 0.6283877491950989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5808197855949402, + "epoch": 2.53, + "learning_rate": 3.737109044801353e-05, + "loss": 0.7049, + "step": 2988, + "task_loss": 0.7324851155281067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9331092834472656, + "epoch": 2.53, + "learning_rate": 3.7366863905325445e-05, + "loss": 0.8273, + "step": 2989, + "task_loss": 0.6153925061225891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7532095909118652, + "epoch": 2.53, + "learning_rate": 3.7362637362637365e-05, + "loss": 0.6142, + "step": 2990, + "task_loss": 0.6104395389556885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9392920732498169, + "epoch": 2.53, + "learning_rate": 3.7358410819949284e-05, + "loss": 0.815, + "step": 2991, + "task_loss": 1.2604948282241821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6861684322357178, + "epoch": 2.53, + "learning_rate": 3.73541842772612e-05, + "loss": 0.9983, + "step": 2992, + "task_loss": 1.0882362127304077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0361250638961792, + "epoch": 2.53, + "learning_rate": 3.7349957734573124e-05, + "loss": 0.9349, + "step": 2993, + "task_loss": 1.1101155281066895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0074084997177124, + "epoch": 2.53, + "learning_rate": 3.7345731191885043e-05, + "loss": 0.7469, + "step": 2994, + "task_loss": 1.1420098543167114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3461261987686157, + "epoch": 2.53, + "learning_rate": 3.7341504649196956e-05, + "loss": 0.893, + "step": 2995, + "task_loss": 0.8898110389709473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6376475095748901, + "epoch": 2.53, + "learning_rate": 3.7337278106508876e-05, + "loss": 0.8891, + "step": 2996, + "task_loss": 1.161588430404663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9966105222702026, + "epoch": 2.53, + "learning_rate": 3.7333051563820796e-05, + "loss": 1.0386, + "step": 2997, + "task_loss": 1.0232625007629395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.771682858467102, + "epoch": 2.53, + "learning_rate": 3.7328825021132716e-05, + "loss": 0.8376, + "step": 2998, + "task_loss": 0.8959367275238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.128648281097412, + "epoch": 2.53, + "learning_rate": 3.7324598478444635e-05, + "loss": 1.1336, + "step": 2999, + "task_loss": 1.08405339717865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.165360927581787, + "epoch": 2.54, + "learning_rate": 3.7320371935756555e-05, + "loss": 0.9728, + "step": 3000, + "task_loss": 0.2984207570552826 + }, + { + "epoch": 2.54, + "eval_accuracy": 0.8767128712871287, + "eval_loss": 0.5263227820396423, + "eval_runtime": 229.1154, + "eval_samples_per_second": 110.206, + "eval_steps_per_second": 0.864, + "step": 3000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6945322751998901, + "epoch": 2.54, + "learning_rate": 3.731614539306847e-05, + "loss": 0.7383, + "step": 3001, + "task_loss": 0.339668333530426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1984777450561523, + "epoch": 2.54, + "learning_rate": 3.731191885038039e-05, + "loss": 0.8861, + "step": 3002, + "task_loss": 1.7218049764633179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6091169118881226, + "epoch": 2.54, + "learning_rate": 3.730769230769231e-05, + "loss": 1.0612, + "step": 3003, + "task_loss": 0.9124078154563904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3853226900100708, + "epoch": 2.54, + "learning_rate": 3.7303465765004234e-05, + "loss": 0.8872, + "step": 3004, + "task_loss": 0.09558850526809692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8512963652610779, + "epoch": 2.54, + "learning_rate": 3.729923922231615e-05, + "loss": 0.774, + "step": 3005, + "task_loss": 1.222814917564392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8113721609115601, + "epoch": 2.54, + "learning_rate": 3.729501267962807e-05, + "loss": 0.8248, + "step": 3006, + "task_loss": 1.6954820156097412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6979273557662964, + "epoch": 2.54, + "learning_rate": 3.7290786136939987e-05, + "loss": 0.7942, + "step": 3007, + "task_loss": 0.5400323271751404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8330560922622681, + "epoch": 2.54, + "learning_rate": 3.72865595942519e-05, + "loss": 0.8045, + "step": 3008, + "task_loss": 0.7397099137306213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8663599491119385, + "epoch": 2.54, + "learning_rate": 3.728233305156382e-05, + "loss": 0.8128, + "step": 3009, + "task_loss": 1.2098785638809204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8310542106628418, + "epoch": 2.54, + "learning_rate": 3.7278106508875746e-05, + "loss": 0.7465, + "step": 3010, + "task_loss": 1.1796890497207642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0074374675750732, + "epoch": 2.54, + "learning_rate": 3.727387996618766e-05, + "loss": 0.8066, + "step": 3011, + "task_loss": 0.8589324355125427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5980572700500488, + "epoch": 2.55, + "learning_rate": 3.726965342349958e-05, + "loss": 0.6101, + "step": 3012, + "task_loss": 0.5425428152084351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6983373165130615, + "epoch": 2.55, + "learning_rate": 3.72654268808115e-05, + "loss": 0.7695, + "step": 3013, + "task_loss": 0.658515214920044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.256879210472107, + "epoch": 2.55, + "learning_rate": 3.726120033812341e-05, + "loss": 0.8399, + "step": 3014, + "task_loss": 1.0945231914520264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.676827073097229, + "epoch": 2.55, + "learning_rate": 3.725697379543534e-05, + "loss": 0.9211, + "step": 3015, + "task_loss": 0.5654138326644897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7911100387573242, + "epoch": 2.55, + "learning_rate": 3.725274725274726e-05, + "loss": 0.947, + "step": 3016, + "task_loss": 0.8976695537567139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7871742248535156, + "epoch": 2.55, + "learning_rate": 3.724852071005918e-05, + "loss": 1.097, + "step": 3017, + "task_loss": 1.3124561309814453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.759321928024292, + "epoch": 2.55, + "learning_rate": 3.724429416737109e-05, + "loss": 0.6181, + "step": 3018, + "task_loss": 0.601523220539093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.903558075428009, + "epoch": 2.55, + "learning_rate": 3.724006762468301e-05, + "loss": 0.8285, + "step": 3019, + "task_loss": 1.2055823802947998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6516184210777283, + "epoch": 2.55, + "learning_rate": 3.723584108199493e-05, + "loss": 0.6495, + "step": 3020, + "task_loss": 0.5704653263092041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6494377851486206, + "epoch": 2.55, + "learning_rate": 3.723161453930685e-05, + "loss": 0.7964, + "step": 3021, + "task_loss": 0.5845313668251038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0027871131896973, + "epoch": 2.55, + "learning_rate": 3.722738799661877e-05, + "loss": 0.8559, + "step": 3022, + "task_loss": 1.3712655305862427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6782982349395752, + "epoch": 2.56, + "learning_rate": 3.722316145393069e-05, + "loss": 0.8091, + "step": 3023, + "task_loss": 1.5369762182235718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9717000126838684, + "epoch": 2.56, + "learning_rate": 3.72189349112426e-05, + "loss": 0.7861, + "step": 3024, + "task_loss": 0.5329558849334717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9537562131881714, + "epoch": 2.56, + "learning_rate": 3.721470836855452e-05, + "loss": 0.7164, + "step": 3025, + "task_loss": 0.9149600267410278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7581993937492371, + "epoch": 2.56, + "learning_rate": 3.721048182586644e-05, + "loss": 0.8483, + "step": 3026, + "task_loss": 0.8229069113731384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8546748757362366, + "epoch": 2.56, + "learning_rate": 3.720625528317836e-05, + "loss": 0.7743, + "step": 3027, + "task_loss": 0.7973331212997437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5849636793136597, + "epoch": 2.56, + "learning_rate": 3.720202874049028e-05, + "loss": 0.6431, + "step": 3028, + "task_loss": 0.3173595368862152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41338205337524414, + "epoch": 2.56, + "learning_rate": 3.71978021978022e-05, + "loss": 0.8066, + "step": 3029, + "task_loss": 0.9902318716049194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.916282057762146, + "epoch": 2.56, + "learning_rate": 3.7193575655114113e-05, + "loss": 0.8332, + "step": 3030, + "task_loss": 0.9232667684555054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5028959512710571, + "epoch": 2.56, + "learning_rate": 3.718934911242603e-05, + "loss": 0.7522, + "step": 3031, + "task_loss": 0.7833327651023865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.824494481086731, + "epoch": 2.56, + "learning_rate": 3.718512256973796e-05, + "loss": 0.9899, + "step": 3032, + "task_loss": 1.2010328769683838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7857167720794678, + "epoch": 2.56, + "learning_rate": 3.718089602704988e-05, + "loss": 0.9092, + "step": 3033, + "task_loss": 1.1328927278518677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.192674994468689, + "epoch": 2.56, + "learning_rate": 3.717666948436179e-05, + "loss": 0.9829, + "step": 3034, + "task_loss": 2.6403465270996094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7629928588867188, + "epoch": 2.57, + "learning_rate": 3.717244294167371e-05, + "loss": 0.8618, + "step": 3035, + "task_loss": 1.6077911853790283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.601716160774231, + "epoch": 2.57, + "learning_rate": 3.716821639898563e-05, + "loss": 0.7449, + "step": 3036, + "task_loss": 1.882880449295044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33205413818359375, + "epoch": 2.57, + "learning_rate": 3.716398985629755e-05, + "loss": 0.677, + "step": 3037, + "task_loss": 0.6416762471199036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6947457790374756, + "epoch": 2.57, + "learning_rate": 3.715976331360947e-05, + "loss": 0.6151, + "step": 3038, + "task_loss": 1.2570137977600098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9590944647789001, + "epoch": 2.57, + "learning_rate": 3.715553677092139e-05, + "loss": 0.7763, + "step": 3039, + "task_loss": 1.5594828128814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6918559074401855, + "epoch": 2.57, + "learning_rate": 3.7151310228233304e-05, + "loss": 0.8121, + "step": 3040, + "task_loss": 1.1324849128723145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9227808713912964, + "epoch": 2.57, + "learning_rate": 3.7147083685545224e-05, + "loss": 0.8574, + "step": 3041, + "task_loss": 1.1076278686523438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6765317916870117, + "epoch": 2.57, + "learning_rate": 3.7142857142857143e-05, + "loss": 0.8381, + "step": 3042, + "task_loss": 1.0900061130523682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.9176989793777466, + "epoch": 2.57, + "learning_rate": 3.713863060016906e-05, + "loss": 1.2098, + "step": 3043, + "task_loss": 1.4418178796768188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1096810102462769, + "epoch": 2.57, + "learning_rate": 3.713440405748098e-05, + "loss": 1.0082, + "step": 3044, + "task_loss": 0.8642387390136719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0091372728347778, + "epoch": 2.57, + "learning_rate": 3.71301775147929e-05, + "loss": 0.9573, + "step": 3045, + "task_loss": 0.8251848220825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8770987391471863, + "epoch": 2.57, + "learning_rate": 3.712595097210482e-05, + "loss": 0.8056, + "step": 3046, + "task_loss": 0.43860915303230286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8536374568939209, + "epoch": 2.58, + "learning_rate": 3.7121724429416735e-05, + "loss": 0.8514, + "step": 3047, + "task_loss": 0.5220704674720764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8492565155029297, + "epoch": 2.58, + "learning_rate": 3.7117497886728655e-05, + "loss": 0.9157, + "step": 3048, + "task_loss": 1.6889747381210327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6147288680076599, + "epoch": 2.58, + "learning_rate": 3.711327134404058e-05, + "loss": 0.7799, + "step": 3049, + "task_loss": 1.0682823657989502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 2.528697967529297, + "epoch": 2.58, + "learning_rate": 3.7109044801352495e-05, + "loss": 1.3538, + "step": 3050, + "task_loss": 1.2485682964324951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2987728118896484, + "epoch": 2.58, + "learning_rate": 3.7104818258664414e-05, + "loss": 0.9158, + "step": 3051, + "task_loss": 1.3291471004486084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.193570852279663, + "epoch": 2.58, + "learning_rate": 3.7100591715976334e-05, + "loss": 0.9452, + "step": 3052, + "task_loss": 0.9564276933670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5314393043518066, + "epoch": 2.58, + "learning_rate": 3.709636517328825e-05, + "loss": 0.8745, + "step": 3053, + "task_loss": 0.9978659152984619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.655224084854126, + "epoch": 2.58, + "learning_rate": 3.7092138630600174e-05, + "loss": 0.7079, + "step": 3054, + "task_loss": 0.8744780421257019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9647119045257568, + "epoch": 2.58, + "learning_rate": 3.708791208791209e-05, + "loss": 0.97, + "step": 3055, + "task_loss": 1.4981162548065186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1862475872039795, + "epoch": 2.58, + "learning_rate": 3.7083685545224006e-05, + "loss": 0.7877, + "step": 3056, + "task_loss": 0.99953693151474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3654099106788635, + "epoch": 2.58, + "learning_rate": 3.7079459002535926e-05, + "loss": 0.6332, + "step": 3057, + "task_loss": 0.5118232369422913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6309930086135864, + "epoch": 2.58, + "learning_rate": 3.7075232459847846e-05, + "loss": 0.7779, + "step": 3058, + "task_loss": 1.5678181648254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3080609440803528, + "epoch": 2.59, + "learning_rate": 3.7071005917159765e-05, + "loss": 0.5809, + "step": 3059, + "task_loss": 0.07553819566965103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4632693827152252, + "epoch": 2.59, + "learning_rate": 3.7066779374471685e-05, + "loss": 0.7916, + "step": 3060, + "task_loss": 0.9021651744842529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7074669599533081, + "epoch": 2.59, + "learning_rate": 3.7062552831783605e-05, + "loss": 0.7534, + "step": 3061, + "task_loss": 0.7439253926277161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7454609870910645, + "epoch": 2.59, + "learning_rate": 3.7058326289095525e-05, + "loss": 0.8225, + "step": 3062, + "task_loss": 1.0032330751419067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7973410487174988, + "epoch": 2.59, + "learning_rate": 3.705409974640744e-05, + "loss": 0.8569, + "step": 3063, + "task_loss": 1.0491392612457275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.993553876876831, + "epoch": 2.59, + "learning_rate": 3.704987320371936e-05, + "loss": 1.0378, + "step": 3064, + "task_loss": 1.0859644412994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0121476650238037, + "epoch": 2.59, + "learning_rate": 3.704564666103128e-05, + "loss": 0.8503, + "step": 3065, + "task_loss": 0.9676469564437866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7435230016708374, + "epoch": 2.59, + "learning_rate": 3.70414201183432e-05, + "loss": 0.6776, + "step": 3066, + "task_loss": 0.31943991780281067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8353585004806519, + "epoch": 2.59, + "learning_rate": 3.7037193575655117e-05, + "loss": 0.7704, + "step": 3067, + "task_loss": 0.6388733983039856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7831616997718811, + "epoch": 2.59, + "learning_rate": 3.7032967032967036e-05, + "loss": 0.8672, + "step": 3068, + "task_loss": 0.8999489545822144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7162842154502869, + "epoch": 2.59, + "learning_rate": 3.702874049027895e-05, + "loss": 0.7685, + "step": 3069, + "task_loss": 1.4781559705734253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1050009727478027, + "epoch": 2.59, + "learning_rate": 3.702451394759087e-05, + "loss": 0.8955, + "step": 3070, + "task_loss": 1.096408724784851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0002341270446777, + "epoch": 2.6, + "learning_rate": 3.7020287404902796e-05, + "loss": 0.9608, + "step": 3071, + "task_loss": 1.78266179561615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0992339849472046, + "epoch": 2.6, + "learning_rate": 3.701606086221471e-05, + "loss": 0.9172, + "step": 3072, + "task_loss": 0.5865455269813538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.010217308998108, + "epoch": 2.6, + "learning_rate": 3.701183431952663e-05, + "loss": 1.0825, + "step": 3073, + "task_loss": 1.2458832263946533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.130928874015808, + "epoch": 2.6, + "learning_rate": 3.700760777683855e-05, + "loss": 0.9746, + "step": 3074, + "task_loss": 0.8048464059829712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5870479941368103, + "epoch": 2.6, + "learning_rate": 3.700338123415047e-05, + "loss": 0.6023, + "step": 3075, + "task_loss": 0.7509044408798218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5582225918769836, + "epoch": 2.6, + "learning_rate": 3.699915469146239e-05, + "loss": 0.7512, + "step": 3076, + "task_loss": 1.2097479104995728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5444167852401733, + "epoch": 2.6, + "learning_rate": 3.699492814877431e-05, + "loss": 0.9537, + "step": 3077, + "task_loss": 1.0808779001235962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6360112428665161, + "epoch": 2.6, + "learning_rate": 3.699070160608623e-05, + "loss": 0.677, + "step": 3078, + "task_loss": 0.7513692378997803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5899777412414551, + "epoch": 2.6, + "learning_rate": 3.698647506339814e-05, + "loss": 0.7193, + "step": 3079, + "task_loss": 1.1273689270019531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6693968176841736, + "epoch": 2.6, + "learning_rate": 3.698224852071006e-05, + "loss": 0.986, + "step": 3080, + "task_loss": 0.14810732007026672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1231316328048706, + "epoch": 2.6, + "learning_rate": 3.697802197802198e-05, + "loss": 0.7848, + "step": 3081, + "task_loss": 0.6202142238616943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8374344110488892, + "epoch": 2.6, + "learning_rate": 3.69737954353339e-05, + "loss": 0.7082, + "step": 3082, + "task_loss": 1.0106689929962158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4730343818664551, + "epoch": 2.61, + "learning_rate": 3.696956889264582e-05, + "loss": 0.5381, + "step": 3083, + "task_loss": 0.5265515446662903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9261061549186707, + "epoch": 2.61, + "learning_rate": 3.696534234995774e-05, + "loss": 0.9142, + "step": 3084, + "task_loss": 1.6120854616165161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8333394527435303, + "epoch": 2.61, + "learning_rate": 3.696111580726965e-05, + "loss": 0.734, + "step": 3085, + "task_loss": 0.7873744964599609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5093247294425964, + "epoch": 2.61, + "learning_rate": 3.695688926458157e-05, + "loss": 0.6228, + "step": 3086, + "task_loss": 0.41594576835632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5116265416145325, + "epoch": 2.61, + "learning_rate": 3.695266272189349e-05, + "loss": 0.7679, + "step": 3087, + "task_loss": 0.572145402431488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8843633532524109, + "epoch": 2.61, + "learning_rate": 3.694843617920541e-05, + "loss": 0.8531, + "step": 3088, + "task_loss": 1.0680478811264038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9782377481460571, + "epoch": 2.61, + "learning_rate": 3.694420963651733e-05, + "loss": 0.8237, + "step": 3089, + "task_loss": 0.8365287184715271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.888074517250061, + "epoch": 2.61, + "learning_rate": 3.693998309382925e-05, + "loss": 0.8532, + "step": 3090, + "task_loss": 0.8413580656051636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7443833351135254, + "epoch": 2.61, + "learning_rate": 3.693575655114117e-05, + "loss": 0.6207, + "step": 3091, + "task_loss": 0.16518382728099823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6082497835159302, + "epoch": 2.61, + "learning_rate": 3.693153000845308e-05, + "loss": 0.6569, + "step": 3092, + "task_loss": 1.1876845359802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9244512319564819, + "epoch": 2.61, + "learning_rate": 3.692730346576501e-05, + "loss": 0.86, + "step": 3093, + "task_loss": 1.145851969718933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2092303037643433, + "epoch": 2.61, + "learning_rate": 3.692307692307693e-05, + "loss": 0.8806, + "step": 3094, + "task_loss": 1.143178105354309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.554682195186615, + "epoch": 2.62, + "learning_rate": 3.691885038038884e-05, + "loss": 1.0177, + "step": 3095, + "task_loss": 1.0971779823303223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5559208393096924, + "epoch": 2.62, + "learning_rate": 3.691462383770076e-05, + "loss": 0.7795, + "step": 3096, + "task_loss": 0.5764684081077576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7125136852264404, + "epoch": 2.62, + "learning_rate": 3.691039729501268e-05, + "loss": 0.699, + "step": 3097, + "task_loss": 0.8046870827674866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.736830472946167, + "epoch": 2.62, + "learning_rate": 3.69061707523246e-05, + "loss": 0.7584, + "step": 3098, + "task_loss": 1.0614959001541138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6122874617576599, + "epoch": 2.62, + "learning_rate": 3.690194420963652e-05, + "loss": 0.7071, + "step": 3099, + "task_loss": 0.9087220430374146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7354944944381714, + "epoch": 2.62, + "learning_rate": 3.689771766694844e-05, + "loss": 0.8316, + "step": 3100, + "task_loss": 0.5507137775421143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0724760293960571, + "epoch": 2.62, + "learning_rate": 3.6893491124260354e-05, + "loss": 0.7883, + "step": 3101, + "task_loss": 1.7109241485595703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8051074147224426, + "epoch": 2.62, + "learning_rate": 3.6889264581572274e-05, + "loss": 0.9858, + "step": 3102, + "task_loss": 1.4117804765701294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35728687047958374, + "epoch": 2.62, + "learning_rate": 3.688503803888419e-05, + "loss": 0.6221, + "step": 3103, + "task_loss": 0.13781708478927612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8510875701904297, + "epoch": 2.62, + "learning_rate": 3.688081149619611e-05, + "loss": 0.7583, + "step": 3104, + "task_loss": 0.6962308883666992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5689719915390015, + "epoch": 2.62, + "learning_rate": 3.687658495350803e-05, + "loss": 0.841, + "step": 3105, + "task_loss": 0.19332218170166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49652373790740967, + "epoch": 2.63, + "learning_rate": 3.687235841081995e-05, + "loss": 0.7055, + "step": 3106, + "task_loss": 0.3223428428173065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8462597131729126, + "epoch": 2.63, + "learning_rate": 3.686813186813187e-05, + "loss": 0.7816, + "step": 3107, + "task_loss": 0.37198585271835327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8038350343704224, + "epoch": 2.63, + "learning_rate": 3.6863905325443785e-05, + "loss": 0.8849, + "step": 3108, + "task_loss": 0.8472291231155396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8491611480712891, + "epoch": 2.63, + "learning_rate": 3.6859678782755705e-05, + "loss": 0.7523, + "step": 3109, + "task_loss": 1.155708909034729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9595373272895813, + "epoch": 2.63, + "learning_rate": 3.685545224006763e-05, + "loss": 0.7857, + "step": 3110, + "task_loss": 0.40872299671173096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5706314444541931, + "epoch": 2.63, + "learning_rate": 3.6851225697379544e-05, + "loss": 0.7751, + "step": 3111, + "task_loss": 1.0269041061401367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8550041913986206, + "epoch": 2.63, + "learning_rate": 3.6846999154691464e-05, + "loss": 0.7515, + "step": 3112, + "task_loss": 0.665654718875885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4683772921562195, + "epoch": 2.63, + "learning_rate": 3.6842772612003384e-05, + "loss": 0.6526, + "step": 3113, + "task_loss": 0.5071851015090942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9408434629440308, + "epoch": 2.63, + "learning_rate": 3.68385460693153e-05, + "loss": 0.6859, + "step": 3114, + "task_loss": 1.917077660560608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3868627548217773, + "epoch": 2.63, + "learning_rate": 3.6834319526627223e-05, + "loss": 1.1517, + "step": 3115, + "task_loss": 0.8969753384590149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6709154844284058, + "epoch": 2.63, + "learning_rate": 3.683009298393914e-05, + "loss": 0.7277, + "step": 3116, + "task_loss": 0.8024963736534119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6856657266616821, + "epoch": 2.63, + "learning_rate": 3.6825866441251056e-05, + "loss": 0.7587, + "step": 3117, + "task_loss": 1.1278657913208008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4586241245269775, + "epoch": 2.64, + "learning_rate": 3.6821639898562976e-05, + "loss": 1.0479, + "step": 3118, + "task_loss": 2.441871404647827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38083067536354065, + "epoch": 2.64, + "learning_rate": 3.6817413355874896e-05, + "loss": 0.6188, + "step": 3119, + "task_loss": 0.14536947011947632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.266404926776886, + "epoch": 2.64, + "learning_rate": 3.6813186813186815e-05, + "loss": 0.7, + "step": 3120, + "task_loss": 0.01973012648522854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4878993034362793, + "epoch": 2.64, + "learning_rate": 3.6808960270498735e-05, + "loss": 0.6608, + "step": 3121, + "task_loss": 0.45334842801094055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7327946424484253, + "epoch": 2.64, + "learning_rate": 3.6804733727810655e-05, + "loss": 0.7115, + "step": 3122, + "task_loss": 0.09124201536178589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.828135073184967, + "epoch": 2.64, + "learning_rate": 3.6800507185122575e-05, + "loss": 0.9087, + "step": 3123, + "task_loss": 0.6044538617134094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6660695672035217, + "epoch": 2.64, + "learning_rate": 3.679628064243449e-05, + "loss": 0.6791, + "step": 3124, + "task_loss": 0.637639582157135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9629924893379211, + "epoch": 2.64, + "learning_rate": 3.679205409974641e-05, + "loss": 0.8396, + "step": 3125, + "task_loss": 0.39505887031555176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4094371795654297, + "epoch": 2.64, + "learning_rate": 3.678782755705833e-05, + "loss": 0.812, + "step": 3126, + "task_loss": 0.2473873496055603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5572298765182495, + "epoch": 2.64, + "learning_rate": 3.678360101437025e-05, + "loss": 0.6219, + "step": 3127, + "task_loss": 0.7138562798500061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9096237421035767, + "epoch": 2.64, + "learning_rate": 3.6779374471682166e-05, + "loss": 0.9448, + "step": 3128, + "task_loss": 0.7358607649803162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6246610879898071, + "epoch": 2.64, + "learning_rate": 3.6775147928994086e-05, + "loss": 1.0002, + "step": 3129, + "task_loss": 0.6177615523338318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8174480199813843, + "epoch": 2.65, + "learning_rate": 3.6770921386306e-05, + "loss": 0.6996, + "step": 3130, + "task_loss": 1.3445311784744263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7565032243728638, + "epoch": 2.65, + "learning_rate": 3.676669484361792e-05, + "loss": 0.8786, + "step": 3131, + "task_loss": 0.7241485714912415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5404148101806641, + "epoch": 2.65, + "learning_rate": 3.6762468300929845e-05, + "loss": 0.8668, + "step": 3132, + "task_loss": 0.15434055030345917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9013053774833679, + "epoch": 2.65, + "learning_rate": 3.6758241758241765e-05, + "loss": 0.8664, + "step": 3133, + "task_loss": 0.5697086453437805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5654147863388062, + "epoch": 2.65, + "learning_rate": 3.675401521555368e-05, + "loss": 0.8082, + "step": 3134, + "task_loss": 1.0989187955856323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.82607102394104, + "epoch": 2.65, + "learning_rate": 3.67497886728656e-05, + "loss": 1.0775, + "step": 3135, + "task_loss": 1.3307024240493774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6478612422943115, + "epoch": 2.65, + "learning_rate": 3.674556213017752e-05, + "loss": 0.699, + "step": 3136, + "task_loss": 0.6293437480926514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6773472428321838, + "epoch": 2.65, + "learning_rate": 3.674133558748943e-05, + "loss": 0.7931, + "step": 3137, + "task_loss": 0.808694064617157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3730204403400421, + "epoch": 2.65, + "learning_rate": 3.673710904480136e-05, + "loss": 0.7924, + "step": 3138, + "task_loss": 0.856404721736908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1635000705718994, + "epoch": 2.65, + "learning_rate": 3.673288250211328e-05, + "loss": 0.9082, + "step": 3139, + "task_loss": 1.7407945394515991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3419901132583618, + "epoch": 2.65, + "learning_rate": 3.672865595942519e-05, + "loss": 1.0011, + "step": 3140, + "task_loss": 1.4921413660049438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8441625237464905, + "epoch": 2.65, + "learning_rate": 3.672442941673711e-05, + "loss": 0.8424, + "step": 3141, + "task_loss": 1.1122411489486694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4720938205718994, + "epoch": 2.66, + "learning_rate": 3.672020287404903e-05, + "loss": 0.816, + "step": 3142, + "task_loss": 0.20232322812080383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8524852395057678, + "epoch": 2.66, + "learning_rate": 3.671597633136095e-05, + "loss": 0.7595, + "step": 3143, + "task_loss": 1.4422588348388672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46059346199035645, + "epoch": 2.66, + "learning_rate": 3.671174978867287e-05, + "loss": 0.6115, + "step": 3144, + "task_loss": 0.1372869461774826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9297634363174438, + "epoch": 2.66, + "learning_rate": 3.670752324598479e-05, + "loss": 0.8001, + "step": 3145, + "task_loss": 1.1383341550827026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5038588047027588, + "epoch": 2.66, + "learning_rate": 3.67032967032967e-05, + "loss": 0.7782, + "step": 3146, + "task_loss": 1.093130111694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1263211965560913, + "epoch": 2.66, + "learning_rate": 3.669907016060862e-05, + "loss": 0.7513, + "step": 3147, + "task_loss": 1.1021108627319336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26984137296676636, + "epoch": 2.66, + "learning_rate": 3.669484361792054e-05, + "loss": 0.7818, + "step": 3148, + "task_loss": 0.33520352840423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8587289452552795, + "epoch": 2.66, + "learning_rate": 3.669061707523247e-05, + "loss": 0.8354, + "step": 3149, + "task_loss": 0.7077796459197998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.590491533279419, + "epoch": 2.66, + "learning_rate": 3.668639053254438e-05, + "loss": 0.6469, + "step": 3150, + "task_loss": 1.1975209712982178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8266935348510742, + "epoch": 2.66, + "learning_rate": 3.66821639898563e-05, + "loss": 0.863, + "step": 3151, + "task_loss": 0.990756630897522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.7702124118804932, + "epoch": 2.66, + "learning_rate": 3.667793744716822e-05, + "loss": 0.9335, + "step": 3152, + "task_loss": 1.1748255491256714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5679048299789429, + "epoch": 2.66, + "learning_rate": 3.667371090448013e-05, + "loss": 0.5897, + "step": 3153, + "task_loss": 0.6763615012168884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6074539422988892, + "epoch": 2.67, + "learning_rate": 3.666948436179205e-05, + "loss": 0.6592, + "step": 3154, + "task_loss": 0.31726667284965515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5907617211341858, + "epoch": 2.67, + "learning_rate": 3.666525781910398e-05, + "loss": 0.9045, + "step": 3155, + "task_loss": 0.6866695880889893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6058465242385864, + "epoch": 2.67, + "learning_rate": 3.666103127641589e-05, + "loss": 0.9728, + "step": 3156, + "task_loss": 0.6506273746490479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5541400909423828, + "epoch": 2.67, + "learning_rate": 3.665680473372781e-05, + "loss": 0.5683, + "step": 3157, + "task_loss": 0.1370454728603363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8852889537811279, + "epoch": 2.67, + "learning_rate": 3.665257819103973e-05, + "loss": 1.1754, + "step": 3158, + "task_loss": 1.3376331329345703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0697664022445679, + "epoch": 2.67, + "learning_rate": 3.6648351648351644e-05, + "loss": 0.8793, + "step": 3159, + "task_loss": 1.8662017583847046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9096803665161133, + "epoch": 2.67, + "learning_rate": 3.664412510566357e-05, + "loss": 0.9165, + "step": 3160, + "task_loss": 0.8790223002433777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2309272289276123, + "epoch": 2.67, + "learning_rate": 3.663989856297549e-05, + "loss": 0.7827, + "step": 3161, + "task_loss": 1.2024024724960327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8758441209793091, + "epoch": 2.67, + "learning_rate": 3.663567202028741e-05, + "loss": 0.7908, + "step": 3162, + "task_loss": 0.8387846946716309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1439201831817627, + "epoch": 2.67, + "learning_rate": 3.6631445477599323e-05, + "loss": 0.6686, + "step": 3163, + "task_loss": 1.2961164712905884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6691940426826477, + "epoch": 2.67, + "learning_rate": 3.662721893491124e-05, + "loss": 0.5438, + "step": 3164, + "task_loss": 1.4465086460113525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6842027306556702, + "epoch": 2.67, + "learning_rate": 3.662299239222316e-05, + "loss": 0.5976, + "step": 3165, + "task_loss": 1.9269887208938599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8079041242599487, + "epoch": 2.68, + "learning_rate": 3.661876584953508e-05, + "loss": 0.6748, + "step": 3166, + "task_loss": 1.124586582183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0641887187957764, + "epoch": 2.68, + "learning_rate": 3.6614539306847e-05, + "loss": 0.8475, + "step": 3167, + "task_loss": 1.0529060363769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5148777365684509, + "epoch": 2.68, + "learning_rate": 3.661031276415892e-05, + "loss": 0.8851, + "step": 3168, + "task_loss": 0.16598300635814667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9191156625747681, + "epoch": 2.68, + "learning_rate": 3.6606086221470835e-05, + "loss": 0.946, + "step": 3169, + "task_loss": 0.8727797865867615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.206129550933838, + "epoch": 2.68, + "learning_rate": 3.6601859678782755e-05, + "loss": 0.9677, + "step": 3170, + "task_loss": 1.1366856098175049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.522931694984436, + "epoch": 2.68, + "learning_rate": 3.6597633136094675e-05, + "loss": 0.8306, + "step": 3171, + "task_loss": 0.7132547497749329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8306045532226562, + "epoch": 2.68, + "learning_rate": 3.6593406593406594e-05, + "loss": 0.874, + "step": 3172, + "task_loss": 0.5333722829818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4200117588043213, + "epoch": 2.68, + "learning_rate": 3.6589180050718514e-05, + "loss": 0.6628, + "step": 3173, + "task_loss": 0.33211979269981384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6182844638824463, + "epoch": 2.68, + "learning_rate": 3.6584953508030434e-05, + "loss": 0.7342, + "step": 3174, + "task_loss": 1.420999526977539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8615784645080566, + "epoch": 2.68, + "learning_rate": 3.658072696534235e-05, + "loss": 0.6684, + "step": 3175, + "task_loss": 1.2398861646652222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35819321870803833, + "epoch": 2.68, + "learning_rate": 3.6576500422654266e-05, + "loss": 0.6991, + "step": 3176, + "task_loss": 0.8791791200637817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2637050747871399, + "epoch": 2.69, + "learning_rate": 3.657227387996619e-05, + "loss": 0.8037, + "step": 3177, + "task_loss": 0.8369206786155701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.871681809425354, + "epoch": 2.69, + "learning_rate": 3.656804733727811e-05, + "loss": 0.943, + "step": 3178, + "task_loss": 2.086747407913208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7784783840179443, + "epoch": 2.69, + "learning_rate": 3.6563820794590026e-05, + "loss": 0.8126, + "step": 3179, + "task_loss": 1.2934685945510864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6373320817947388, + "epoch": 2.69, + "learning_rate": 3.6559594251901945e-05, + "loss": 0.6403, + "step": 3180, + "task_loss": 0.36190930008888245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7173376083374023, + "epoch": 2.69, + "learning_rate": 3.6555367709213865e-05, + "loss": 0.8833, + "step": 3181, + "task_loss": 0.8949460983276367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46418094635009766, + "epoch": 2.69, + "learning_rate": 3.6551141166525785e-05, + "loss": 0.8114, + "step": 3182, + "task_loss": 0.15461945533752441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5413683652877808, + "epoch": 2.69, + "learning_rate": 3.6546914623837705e-05, + "loss": 0.6256, + "step": 3183, + "task_loss": 0.36658158898353577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.017490029335022, + "epoch": 2.69, + "learning_rate": 3.6542688081149624e-05, + "loss": 0.6705, + "step": 3184, + "task_loss": 0.505678117275238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6262437701225281, + "epoch": 2.69, + "learning_rate": 3.653846153846154e-05, + "loss": 0.6796, + "step": 3185, + "task_loss": 0.4420837461948395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7411134839057922, + "epoch": 2.69, + "learning_rate": 3.653423499577346e-05, + "loss": 0.5585, + "step": 3186, + "task_loss": 0.5205428004264832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8411450386047363, + "epoch": 2.69, + "learning_rate": 3.653000845308538e-05, + "loss": 0.8477, + "step": 3187, + "task_loss": 0.38130441308021545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7072612643241882, + "epoch": 2.69, + "learning_rate": 3.6525781910397297e-05, + "loss": 0.8493, + "step": 3188, + "task_loss": 1.6934208869934082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8371607065200806, + "epoch": 2.7, + "learning_rate": 3.6521555367709216e-05, + "loss": 0.7935, + "step": 3189, + "task_loss": 0.3760598301887512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7347347736358643, + "epoch": 2.7, + "learning_rate": 3.6517328825021136e-05, + "loss": 0.6921, + "step": 3190, + "task_loss": 0.7482430934906006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.421387255191803, + "epoch": 2.7, + "learning_rate": 3.6513102282333056e-05, + "loss": 0.703, + "step": 3191, + "task_loss": 0.4672171175479889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44681286811828613, + "epoch": 2.7, + "learning_rate": 3.650887573964497e-05, + "loss": 0.8332, + "step": 3192, + "task_loss": 0.6652718782424927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8775023818016052, + "epoch": 2.7, + "learning_rate": 3.650464919695689e-05, + "loss": 0.8674, + "step": 3193, + "task_loss": 1.6609073877334595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8900438547134399, + "epoch": 2.7, + "learning_rate": 3.6500422654268815e-05, + "loss": 0.8038, + "step": 3194, + "task_loss": 0.7634229063987732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2499973773956299, + "epoch": 2.7, + "learning_rate": 3.649619611158073e-05, + "loss": 0.7465, + "step": 3195, + "task_loss": 1.215793490409851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9322706460952759, + "epoch": 2.7, + "learning_rate": 3.649196956889265e-05, + "loss": 0.7743, + "step": 3196, + "task_loss": 0.750275194644928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9437536001205444, + "epoch": 2.7, + "learning_rate": 3.648774302620457e-05, + "loss": 0.7276, + "step": 3197, + "task_loss": 0.6326310038566589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7384657263755798, + "epoch": 2.7, + "learning_rate": 3.648351648351648e-05, + "loss": 0.7106, + "step": 3198, + "task_loss": 0.2647033631801605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5414721965789795, + "epoch": 2.7, + "learning_rate": 3.647928994082841e-05, + "loss": 0.6224, + "step": 3199, + "task_loss": 0.3809734880924225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.729843020439148, + "epoch": 2.7, + "learning_rate": 3.647506339814033e-05, + "loss": 0.6386, + "step": 3200, + "task_loss": 1.819659948348999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34137946367263794, + "epoch": 2.71, + "learning_rate": 3.647083685545224e-05, + "loss": 0.6281, + "step": 3201, + "task_loss": 1.0081596374511719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7048791646957397, + "epoch": 2.71, + "learning_rate": 3.646661031276416e-05, + "loss": 0.749, + "step": 3202, + "task_loss": 1.5170516967773438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4699389636516571, + "epoch": 2.71, + "learning_rate": 3.646238377007608e-05, + "loss": 0.6961, + "step": 3203, + "task_loss": 0.861007809638977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7379894256591797, + "epoch": 2.71, + "learning_rate": 3.6458157227388e-05, + "loss": 0.6025, + "step": 3204, + "task_loss": 0.4677521288394928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6614158749580383, + "epoch": 2.71, + "learning_rate": 3.645393068469992e-05, + "loss": 0.6066, + "step": 3205, + "task_loss": 0.6612241268157959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.754846453666687, + "epoch": 2.71, + "learning_rate": 3.644970414201184e-05, + "loss": 0.9757, + "step": 3206, + "task_loss": 1.5634241104125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3605467081069946, + "epoch": 2.71, + "learning_rate": 3.644547759932376e-05, + "loss": 0.9866, + "step": 3207, + "task_loss": 0.7468777894973755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5064225196838379, + "epoch": 2.71, + "learning_rate": 3.644125105663567e-05, + "loss": 0.777, + "step": 3208, + "task_loss": 0.5832480192184448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1239912509918213, + "epoch": 2.71, + "learning_rate": 3.643702451394759e-05, + "loss": 0.9966, + "step": 3209, + "task_loss": 2.2184693813323975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5373660326004028, + "epoch": 2.71, + "learning_rate": 3.643279797125951e-05, + "loss": 0.6815, + "step": 3210, + "task_loss": 0.12213116139173508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.779929518699646, + "epoch": 2.71, + "learning_rate": 3.642857142857143e-05, + "loss": 0.7837, + "step": 3211, + "task_loss": 0.4112778902053833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9307505488395691, + "epoch": 2.71, + "learning_rate": 3.642434488588335e-05, + "loss": 0.9188, + "step": 3212, + "task_loss": 0.6986552476882935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1074609756469727, + "epoch": 2.72, + "learning_rate": 3.642011834319527e-05, + "loss": 0.7545, + "step": 3213, + "task_loss": 0.7192916870117188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.714178204536438, + "epoch": 2.72, + "learning_rate": 3.641589180050718e-05, + "loss": 0.8383, + "step": 3214, + "task_loss": 0.8756546378135681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1989495754241943, + "epoch": 2.72, + "learning_rate": 3.64116652578191e-05, + "loss": 0.8403, + "step": 3215, + "task_loss": 0.739342451095581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8341965079307556, + "epoch": 2.72, + "learning_rate": 3.640743871513103e-05, + "loss": 0.8019, + "step": 3216, + "task_loss": 0.7803371548652649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3988834023475647, + "epoch": 2.72, + "learning_rate": 3.640321217244294e-05, + "loss": 0.6493, + "step": 3217, + "task_loss": 0.3000013828277588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3551662862300873, + "epoch": 2.72, + "learning_rate": 3.639898562975486e-05, + "loss": 0.8931, + "step": 3218, + "task_loss": 0.38305312395095825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3575408458709717, + "epoch": 2.72, + "learning_rate": 3.639475908706678e-05, + "loss": 0.8913, + "step": 3219, + "task_loss": 0.46277594566345215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7194536924362183, + "epoch": 2.72, + "learning_rate": 3.63905325443787e-05, + "loss": 0.8869, + "step": 3220, + "task_loss": 0.46771782636642456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5157691240310669, + "epoch": 2.72, + "learning_rate": 3.638630600169062e-05, + "loss": 0.5827, + "step": 3221, + "task_loss": 0.5627987384796143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.763983428478241, + "epoch": 2.72, + "learning_rate": 3.638207945900254e-05, + "loss": 0.6576, + "step": 3222, + "task_loss": 1.1351430416107178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0026105642318726, + "epoch": 2.72, + "learning_rate": 3.637785291631446e-05, + "loss": 0.821, + "step": 3223, + "task_loss": 1.5181260108947754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0317902565002441, + "epoch": 2.72, + "learning_rate": 3.637362637362637e-05, + "loss": 1.0015, + "step": 3224, + "task_loss": 1.0426018238067627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7107886075973511, + "epoch": 2.73, + "learning_rate": 3.636939983093829e-05, + "loss": 0.7966, + "step": 3225, + "task_loss": 0.211040198802948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9026594161987305, + "epoch": 2.73, + "learning_rate": 3.636517328825021e-05, + "loss": 0.9178, + "step": 3226, + "task_loss": 1.6117362976074219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0398650169372559, + "epoch": 2.73, + "learning_rate": 3.636094674556213e-05, + "loss": 0.8409, + "step": 3227, + "task_loss": 1.1032884120941162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7302571535110474, + "epoch": 2.73, + "learning_rate": 3.635672020287405e-05, + "loss": 1.0826, + "step": 3228, + "task_loss": 0.989496648311615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4171563982963562, + "epoch": 2.73, + "learning_rate": 3.635249366018597e-05, + "loss": 0.6483, + "step": 3229, + "task_loss": 0.3753688335418701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5672946572303772, + "epoch": 2.73, + "learning_rate": 3.6348267117497885e-05, + "loss": 0.8951, + "step": 3230, + "task_loss": 1.9011656045913696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8212553858757019, + "epoch": 2.73, + "learning_rate": 3.6344040574809805e-05, + "loss": 0.8339, + "step": 3231, + "task_loss": 0.9959295988082886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.012403130531311, + "epoch": 2.73, + "learning_rate": 3.6339814032121724e-05, + "loss": 0.8895, + "step": 3232, + "task_loss": 1.1733182668685913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5102289915084839, + "epoch": 2.73, + "learning_rate": 3.6335587489433644e-05, + "loss": 0.6307, + "step": 3233, + "task_loss": 0.38015079498291016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6747186779975891, + "epoch": 2.73, + "learning_rate": 3.6331360946745564e-05, + "loss": 0.7663, + "step": 3234, + "task_loss": 0.34005922079086304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8523671627044678, + "epoch": 2.73, + "learning_rate": 3.6327134404057484e-05, + "loss": 0.913, + "step": 3235, + "task_loss": 0.8009073734283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8001285791397095, + "epoch": 2.73, + "learning_rate": 3.63229078613694e-05, + "loss": 0.7568, + "step": 3236, + "task_loss": 0.5754767656326294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5291897058486938, + "epoch": 2.74, + "learning_rate": 3.6318681318681316e-05, + "loss": 0.5505, + "step": 3237, + "task_loss": 0.32463839650154114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.691354513168335, + "epoch": 2.74, + "learning_rate": 3.631445477599324e-05, + "loss": 0.71, + "step": 3238, + "task_loss": 0.7392473816871643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5998837947845459, + "epoch": 2.74, + "learning_rate": 3.631022823330516e-05, + "loss": 0.6833, + "step": 3239, + "task_loss": 0.45310693979263306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5507205128669739, + "epoch": 2.74, + "learning_rate": 3.6306001690617076e-05, + "loss": 0.7251, + "step": 3240, + "task_loss": 1.869431972503662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6730242967605591, + "epoch": 2.74, + "learning_rate": 3.6301775147928995e-05, + "loss": 0.6456, + "step": 3241, + "task_loss": 0.3893422484397888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7975687980651855, + "epoch": 2.74, + "learning_rate": 3.6297548605240915e-05, + "loss": 0.6181, + "step": 3242, + "task_loss": 0.5642158389091492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4054068922996521, + "epoch": 2.74, + "learning_rate": 3.6293322062552835e-05, + "loss": 0.5695, + "step": 3243, + "task_loss": 0.4393438696861267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1712977886199951, + "epoch": 2.74, + "learning_rate": 3.6289095519864754e-05, + "loss": 1.0302, + "step": 3244, + "task_loss": 0.899411141872406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5320630073547363, + "epoch": 2.74, + "learning_rate": 3.6284868977176674e-05, + "loss": 0.8687, + "step": 3245, + "task_loss": 0.998672366142273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9773420095443726, + "epoch": 2.74, + "learning_rate": 3.628064243448859e-05, + "loss": 0.8313, + "step": 3246, + "task_loss": 1.0046602487564087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0655792951583862, + "epoch": 2.74, + "learning_rate": 3.627641589180051e-05, + "loss": 0.8468, + "step": 3247, + "task_loss": 0.8440394997596741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7294139266014099, + "epoch": 2.75, + "learning_rate": 3.627218934911243e-05, + "loss": 0.7729, + "step": 3248, + "task_loss": 0.6787798404693604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6052408218383789, + "epoch": 2.75, + "learning_rate": 3.6267962806424346e-05, + "loss": 0.6962, + "step": 3249, + "task_loss": 0.303617388010025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7759650349617004, + "epoch": 2.75, + "learning_rate": 3.6263736263736266e-05, + "loss": 0.6636, + "step": 3250, + "task_loss": 0.6617615818977356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1901512145996094, + "epoch": 2.75, + "learning_rate": 3.6259509721048186e-05, + "loss": 0.9352, + "step": 3251, + "task_loss": 0.8724523782730103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2492327690124512, + "epoch": 2.75, + "learning_rate": 3.6255283178360106e-05, + "loss": 0.9276, + "step": 3252, + "task_loss": 1.086970329284668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43723583221435547, + "epoch": 2.75, + "learning_rate": 3.625105663567202e-05, + "loss": 0.702, + "step": 3253, + "task_loss": 0.5267838835716248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8055061101913452, + "epoch": 2.75, + "learning_rate": 3.624683009298394e-05, + "loss": 0.8433, + "step": 3254, + "task_loss": 1.1333932876586914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2522903680801392, + "epoch": 2.75, + "learning_rate": 3.6242603550295865e-05, + "loss": 0.9854, + "step": 3255, + "task_loss": 0.5626524090766907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.663620114326477, + "epoch": 2.75, + "learning_rate": 3.623837700760778e-05, + "loss": 0.721, + "step": 3256, + "task_loss": 1.1029486656188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6475971937179565, + "epoch": 2.75, + "learning_rate": 3.62341504649197e-05, + "loss": 0.5288, + "step": 3257, + "task_loss": 0.6983718872070312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8854166865348816, + "epoch": 2.75, + "learning_rate": 3.622992392223162e-05, + "loss": 0.7067, + "step": 3258, + "task_loss": 0.9810651540756226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7401001453399658, + "epoch": 2.75, + "learning_rate": 3.622569737954353e-05, + "loss": 0.8089, + "step": 3259, + "task_loss": 1.2944881916046143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0155483484268188, + "epoch": 2.76, + "learning_rate": 3.622147083685546e-05, + "loss": 0.8513, + "step": 3260, + "task_loss": 0.6485295295715332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.96612548828125, + "epoch": 2.76, + "learning_rate": 3.6217244294167376e-05, + "loss": 0.8414, + "step": 3261, + "task_loss": 0.5453760027885437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9183001518249512, + "epoch": 2.76, + "learning_rate": 3.621301775147929e-05, + "loss": 0.7371, + "step": 3262, + "task_loss": 1.088436484336853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5886558294296265, + "epoch": 2.76, + "learning_rate": 3.620879120879121e-05, + "loss": 0.6881, + "step": 3263, + "task_loss": 0.9603484272956848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5060992240905762, + "epoch": 2.76, + "learning_rate": 3.620456466610313e-05, + "loss": 0.6228, + "step": 3264, + "task_loss": 0.31005531549453735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8019952178001404, + "epoch": 2.76, + "learning_rate": 3.620033812341505e-05, + "loss": 0.666, + "step": 3265, + "task_loss": 1.4603077173233032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7681384086608887, + "epoch": 2.76, + "learning_rate": 3.619611158072697e-05, + "loss": 0.8473, + "step": 3266, + "task_loss": 0.13752584159374237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7811859846115112, + "epoch": 2.76, + "learning_rate": 3.619188503803889e-05, + "loss": 0.7979, + "step": 3267, + "task_loss": 0.6954386234283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1156175136566162, + "epoch": 2.76, + "learning_rate": 3.618765849535081e-05, + "loss": 0.9066, + "step": 3268, + "task_loss": 1.5043240785598755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1991462707519531, + "epoch": 2.76, + "learning_rate": 3.618343195266272e-05, + "loss": 0.9056, + "step": 3269, + "task_loss": 1.1476621627807617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7136577367782593, + "epoch": 2.76, + "learning_rate": 3.617920540997464e-05, + "loss": 0.8466, + "step": 3270, + "task_loss": 0.47285696864128113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7878576517105103, + "epoch": 2.76, + "learning_rate": 3.617497886728656e-05, + "loss": 0.9725, + "step": 3271, + "task_loss": 0.29310181736946106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8063193559646606, + "epoch": 2.77, + "learning_rate": 3.617075232459848e-05, + "loss": 0.6592, + "step": 3272, + "task_loss": 0.8167861700057983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5706242918968201, + "epoch": 2.77, + "learning_rate": 3.61665257819104e-05, + "loss": 0.6352, + "step": 3273, + "task_loss": 1.3761039972305298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6302134990692139, + "epoch": 2.77, + "learning_rate": 3.616229923922232e-05, + "loss": 0.616, + "step": 3274, + "task_loss": 0.3765747547149658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47885486483573914, + "epoch": 2.77, + "learning_rate": 3.615807269653423e-05, + "loss": 0.6738, + "step": 3275, + "task_loss": 0.09418342262506485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8178308606147766, + "epoch": 2.77, + "learning_rate": 3.615384615384615e-05, + "loss": 0.8208, + "step": 3276, + "task_loss": 1.2406654357910156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7907090187072754, + "epoch": 2.77, + "learning_rate": 3.614961961115808e-05, + "loss": 0.7618, + "step": 3277, + "task_loss": 0.3515041172504425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.551092803478241, + "epoch": 2.77, + "learning_rate": 3.614539306847e-05, + "loss": 0.6284, + "step": 3278, + "task_loss": 0.6256594657897949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7914831638336182, + "epoch": 2.77, + "learning_rate": 3.614116652578191e-05, + "loss": 0.7499, + "step": 3279, + "task_loss": 1.0081613063812256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8618583679199219, + "epoch": 2.77, + "learning_rate": 3.613693998309383e-05, + "loss": 1.0505, + "step": 3280, + "task_loss": 0.5456560254096985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6455854177474976, + "epoch": 2.77, + "learning_rate": 3.613271344040575e-05, + "loss": 0.8675, + "step": 3281, + "task_loss": 1.8859734535217285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7248468399047852, + "epoch": 2.77, + "learning_rate": 3.6128486897717664e-05, + "loss": 0.9598, + "step": 3282, + "task_loss": 1.896436095237732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39074623584747314, + "epoch": 2.77, + "learning_rate": 3.612426035502959e-05, + "loss": 0.709, + "step": 3283, + "task_loss": 0.208869069814682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5402849316596985, + "epoch": 2.78, + "learning_rate": 3.612003381234151e-05, + "loss": 0.7581, + "step": 3284, + "task_loss": 0.8791276216506958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6216833591461182, + "epoch": 2.78, + "learning_rate": 3.611580726965342e-05, + "loss": 0.9744, + "step": 3285, + "task_loss": 0.6507049202919006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9934566020965576, + "epoch": 2.78, + "learning_rate": 3.611158072696534e-05, + "loss": 1.1314, + "step": 3286, + "task_loss": 0.9094664454460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0117695331573486, + "epoch": 2.78, + "learning_rate": 3.610735418427726e-05, + "loss": 0.7291, + "step": 3287, + "task_loss": 0.9870509505271912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5655584335327148, + "epoch": 2.78, + "learning_rate": 3.610312764158918e-05, + "loss": 0.8105, + "step": 3288, + "task_loss": 0.7977146506309509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6419283151626587, + "epoch": 2.78, + "learning_rate": 3.60989010989011e-05, + "loss": 0.7212, + "step": 3289, + "task_loss": 0.22209587693214417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7782756090164185, + "epoch": 2.78, + "learning_rate": 3.609467455621302e-05, + "loss": 0.7048, + "step": 3290, + "task_loss": 0.5902601480484009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7808916568756104, + "epoch": 2.78, + "learning_rate": 3.6090448013524935e-05, + "loss": 0.9701, + "step": 3291, + "task_loss": 0.38256022334098816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.695368766784668, + "epoch": 2.78, + "learning_rate": 3.6086221470836855e-05, + "loss": 0.7582, + "step": 3292, + "task_loss": 0.7444636821746826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5389489531517029, + "epoch": 2.78, + "learning_rate": 3.6081994928148774e-05, + "loss": 0.7329, + "step": 3293, + "task_loss": 0.7681195139884949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.747118353843689, + "epoch": 2.78, + "learning_rate": 3.6077768385460694e-05, + "loss": 0.7452, + "step": 3294, + "task_loss": 0.6000422239303589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5363589525222778, + "epoch": 2.78, + "learning_rate": 3.6073541842772614e-05, + "loss": 0.7354, + "step": 3295, + "task_loss": 0.9708808660507202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.915900707244873, + "epoch": 2.79, + "learning_rate": 3.6069315300084533e-05, + "loss": 0.7257, + "step": 3296, + "task_loss": 0.6956742405891418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0911142826080322, + "epoch": 2.79, + "learning_rate": 3.606508875739645e-05, + "loss": 0.7316, + "step": 3297, + "task_loss": 1.4301362037658691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8773866295814514, + "epoch": 2.79, + "learning_rate": 3.6060862214708366e-05, + "loss": 0.6273, + "step": 3298, + "task_loss": 0.9789811968803406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6220656633377075, + "epoch": 2.79, + "learning_rate": 3.6056635672020286e-05, + "loss": 0.7831, + "step": 3299, + "task_loss": 0.2030237913131714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38913747668266296, + "epoch": 2.79, + "learning_rate": 3.605240912933221e-05, + "loss": 0.745, + "step": 3300, + "task_loss": 1.2848995923995972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6990540027618408, + "epoch": 2.79, + "learning_rate": 3.6048182586644125e-05, + "loss": 0.9037, + "step": 3301, + "task_loss": 0.8262940645217896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7512128353118896, + "epoch": 2.79, + "learning_rate": 3.6043956043956045e-05, + "loss": 0.8033, + "step": 3302, + "task_loss": 0.40923023223876953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7678865194320679, + "epoch": 2.79, + "learning_rate": 3.6039729501267965e-05, + "loss": 0.9258, + "step": 3303, + "task_loss": 0.954060435295105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0199294090270996, + "epoch": 2.79, + "learning_rate": 3.603550295857988e-05, + "loss": 0.9878, + "step": 3304, + "task_loss": 1.248643398284912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45711737871170044, + "epoch": 2.79, + "learning_rate": 3.6031276415891804e-05, + "loss": 0.8528, + "step": 3305, + "task_loss": 0.7803612351417542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.432719886302948, + "epoch": 2.79, + "learning_rate": 3.6027049873203724e-05, + "loss": 0.5504, + "step": 3306, + "task_loss": 0.5674700736999512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6896347999572754, + "epoch": 2.79, + "learning_rate": 3.6022823330515644e-05, + "loss": 0.7025, + "step": 3307, + "task_loss": 0.706199586391449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5712430477142334, + "epoch": 2.8, + "learning_rate": 3.601859678782756e-05, + "loss": 0.7422, + "step": 3308, + "task_loss": 0.8810210824012756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.841557502746582, + "epoch": 2.8, + "learning_rate": 3.6014370245139477e-05, + "loss": 0.9066, + "step": 3309, + "task_loss": 1.0013234615325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.673639178276062, + "epoch": 2.8, + "learning_rate": 3.6010143702451396e-05, + "loss": 0.7519, + "step": 3310, + "task_loss": 1.6614902019500732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0699958801269531, + "epoch": 2.8, + "learning_rate": 3.6005917159763316e-05, + "loss": 0.8013, + "step": 3311, + "task_loss": 1.4993906021118164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.025099515914917, + "epoch": 2.8, + "learning_rate": 3.6001690617075236e-05, + "loss": 0.84, + "step": 3312, + "task_loss": 0.6921936273574829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5464345216751099, + "epoch": 2.8, + "learning_rate": 3.5997464074387155e-05, + "loss": 0.5862, + "step": 3313, + "task_loss": 0.3845492899417877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7522363662719727, + "epoch": 2.8, + "learning_rate": 3.599323753169907e-05, + "loss": 0.7042, + "step": 3314, + "task_loss": 1.099902629852295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5170813202857971, + "epoch": 2.8, + "learning_rate": 3.598901098901099e-05, + "loss": 0.5995, + "step": 3315, + "task_loss": 0.14206846058368683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8377299308776855, + "epoch": 2.8, + "learning_rate": 3.598478444632291e-05, + "loss": 0.8144, + "step": 3316, + "task_loss": 0.6457552909851074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7462509274482727, + "epoch": 2.8, + "learning_rate": 3.598055790363483e-05, + "loss": 0.6059, + "step": 3317, + "task_loss": 0.8252855539321899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3696131110191345, + "epoch": 2.8, + "learning_rate": 3.597633136094675e-05, + "loss": 0.7073, + "step": 3318, + "task_loss": 0.29601961374282837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3777548670768738, + "epoch": 2.81, + "learning_rate": 3.597210481825867e-05, + "loss": 0.6954, + "step": 3319, + "task_loss": 0.8024699687957764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9105001091957092, + "epoch": 2.81, + "learning_rate": 3.596787827557058e-05, + "loss": 0.6867, + "step": 3320, + "task_loss": 0.9338986277580261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5200917720794678, + "epoch": 2.81, + "learning_rate": 3.59636517328825e-05, + "loss": 0.8007, + "step": 3321, + "task_loss": 0.6254926919937134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20663976669311523, + "epoch": 2.81, + "learning_rate": 3.5959425190194426e-05, + "loss": 0.5437, + "step": 3322, + "task_loss": 0.04157170653343201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5326728820800781, + "epoch": 2.81, + "learning_rate": 3.5955198647506346e-05, + "loss": 0.5746, + "step": 3323, + "task_loss": 0.3979988694190979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6816906332969666, + "epoch": 2.81, + "learning_rate": 3.595097210481826e-05, + "loss": 0.8078, + "step": 3324, + "task_loss": 0.4956477880477905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7264906764030457, + "epoch": 2.81, + "learning_rate": 3.594674556213018e-05, + "loss": 0.706, + "step": 3325, + "task_loss": 0.3928796052932739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7272849082946777, + "epoch": 2.81, + "learning_rate": 3.59425190194421e-05, + "loss": 0.6891, + "step": 3326, + "task_loss": 0.6252204179763794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22119112312793732, + "epoch": 2.81, + "learning_rate": 3.593829247675402e-05, + "loss": 0.5689, + "step": 3327, + "task_loss": 0.45182284712791443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8769271373748779, + "epoch": 2.81, + "learning_rate": 3.593406593406594e-05, + "loss": 0.6481, + "step": 3328, + "task_loss": 0.7757990956306458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5670448541641235, + "epoch": 2.81, + "learning_rate": 3.592983939137786e-05, + "loss": 0.7003, + "step": 3329, + "task_loss": 1.3578851222991943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5573376417160034, + "epoch": 2.81, + "learning_rate": 3.592561284868977e-05, + "loss": 0.9923, + "step": 3330, + "task_loss": 0.48840057849884033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5479972958564758, + "epoch": 2.82, + "learning_rate": 3.592138630600169e-05, + "loss": 0.757, + "step": 3331, + "task_loss": 0.4727529287338257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.484941840171814, + "epoch": 2.82, + "learning_rate": 3.591715976331361e-05, + "loss": 0.8129, + "step": 3332, + "task_loss": 1.3712893724441528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9272233247756958, + "epoch": 2.82, + "learning_rate": 3.591293322062553e-05, + "loss": 0.8068, + "step": 3333, + "task_loss": 1.2493752241134644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7020160555839539, + "epoch": 2.82, + "learning_rate": 3.590870667793745e-05, + "loss": 0.6155, + "step": 3334, + "task_loss": 0.7037437558174133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7469232678413391, + "epoch": 2.82, + "learning_rate": 3.590448013524937e-05, + "loss": 0.6887, + "step": 3335, + "task_loss": 0.8930742740631104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22888050973415375, + "epoch": 2.82, + "learning_rate": 3.590025359256129e-05, + "loss": 0.7191, + "step": 3336, + "task_loss": 0.0376892127096653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7355809211730957, + "epoch": 2.82, + "learning_rate": 3.58960270498732e-05, + "loss": 0.8127, + "step": 3337, + "task_loss": 0.6290755271911621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6525412797927856, + "epoch": 2.82, + "learning_rate": 3.589180050718512e-05, + "loss": 0.9723, + "step": 3338, + "task_loss": 0.6600795984268188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.577083945274353, + "epoch": 2.82, + "learning_rate": 3.588757396449705e-05, + "loss": 0.7216, + "step": 3339, + "task_loss": 0.5928191542625427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4327682554721832, + "epoch": 2.82, + "learning_rate": 3.588334742180896e-05, + "loss": 0.6027, + "step": 3340, + "task_loss": 0.6199809312820435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29940110445022583, + "epoch": 2.82, + "learning_rate": 3.587912087912088e-05, + "loss": 0.7863, + "step": 3341, + "task_loss": 0.24301491677761078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9664512872695923, + "epoch": 2.82, + "learning_rate": 3.58748943364328e-05, + "loss": 0.7198, + "step": 3342, + "task_loss": 1.0228712558746338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4342517852783203, + "epoch": 2.83, + "learning_rate": 3.5870667793744714e-05, + "loss": 0.9083, + "step": 3343, + "task_loss": 0.7765054702758789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2263702005147934, + "epoch": 2.83, + "learning_rate": 3.586644125105664e-05, + "loss": 0.5907, + "step": 3344, + "task_loss": 0.33594268560409546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7420411109924316, + "epoch": 2.83, + "learning_rate": 3.586221470836856e-05, + "loss": 0.7812, + "step": 3345, + "task_loss": 0.3694157898426056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.818916916847229, + "epoch": 2.83, + "learning_rate": 3.585798816568047e-05, + "loss": 0.7194, + "step": 3346, + "task_loss": 0.5279682278633118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3660365343093872, + "epoch": 2.83, + "learning_rate": 3.585376162299239e-05, + "loss": 0.601, + "step": 3347, + "task_loss": 0.21023991703987122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8920690417289734, + "epoch": 2.83, + "learning_rate": 3.584953508030431e-05, + "loss": 0.678, + "step": 3348, + "task_loss": 1.6149805784225464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.581025242805481, + "epoch": 2.83, + "learning_rate": 3.584530853761623e-05, + "loss": 0.6872, + "step": 3349, + "task_loss": 1.0851932764053345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9683977365493774, + "epoch": 2.83, + "learning_rate": 3.584108199492815e-05, + "loss": 0.6892, + "step": 3350, + "task_loss": 1.5431760549545288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6526146531105042, + "epoch": 2.83, + "learning_rate": 3.583685545224007e-05, + "loss": 0.8414, + "step": 3351, + "task_loss": 1.0233675241470337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6186125874519348, + "epoch": 2.83, + "learning_rate": 3.583262890955199e-05, + "loss": 0.6547, + "step": 3352, + "task_loss": 1.1729212999343872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9383249282836914, + "epoch": 2.83, + "learning_rate": 3.5828402366863904e-05, + "loss": 0.6468, + "step": 3353, + "task_loss": 0.399311900138855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6464821696281433, + "epoch": 2.83, + "learning_rate": 3.5824175824175824e-05, + "loss": 0.8103, + "step": 3354, + "task_loss": 1.1796218156814575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1304669380187988, + "epoch": 2.84, + "learning_rate": 3.5819949281487744e-05, + "loss": 0.8737, + "step": 3355, + "task_loss": 1.3280930519104004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9701012372970581, + "epoch": 2.84, + "learning_rate": 3.5815722738799664e-05, + "loss": 0.8338, + "step": 3356, + "task_loss": 0.9874003529548645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5631482601165771, + "epoch": 2.84, + "learning_rate": 3.581149619611158e-05, + "loss": 0.7308, + "step": 3357, + "task_loss": 0.5956993103027344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5462734699249268, + "epoch": 2.84, + "learning_rate": 3.58072696534235e-05, + "loss": 0.5028, + "step": 3358, + "task_loss": 0.6815320253372192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9117316007614136, + "epoch": 2.84, + "learning_rate": 3.5803043110735416e-05, + "loss": 0.9044, + "step": 3359, + "task_loss": 1.6148408651351929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7821139097213745, + "epoch": 2.84, + "learning_rate": 3.5798816568047336e-05, + "loss": 0.9078, + "step": 3360, + "task_loss": 1.0132858753204346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3394583761692047, + "epoch": 2.84, + "learning_rate": 3.579459002535926e-05, + "loss": 0.7167, + "step": 3361, + "task_loss": 0.41234830021858215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4090193510055542, + "epoch": 2.84, + "learning_rate": 3.5790363482671175e-05, + "loss": 0.689, + "step": 3362, + "task_loss": 0.42703545093536377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9072380065917969, + "epoch": 2.84, + "learning_rate": 3.5786136939983095e-05, + "loss": 0.9806, + "step": 3363, + "task_loss": 0.5319318771362305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8753175735473633, + "epoch": 2.84, + "learning_rate": 3.5781910397295015e-05, + "loss": 0.8924, + "step": 3364, + "task_loss": 1.013991355895996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3981405198574066, + "epoch": 2.84, + "learning_rate": 3.5777683854606934e-05, + "loss": 0.7975, + "step": 3365, + "task_loss": 0.9499750733375549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5115039348602295, + "epoch": 2.84, + "learning_rate": 3.5773457311918854e-05, + "loss": 0.6109, + "step": 3366, + "task_loss": 1.3677561283111572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.64418625831604, + "epoch": 2.85, + "learning_rate": 3.5769230769230774e-05, + "loss": 0.9271, + "step": 3367, + "task_loss": 1.1890087127685547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6919095516204834, + "epoch": 2.85, + "learning_rate": 3.5765004226542694e-05, + "loss": 0.7689, + "step": 3368, + "task_loss": 1.3207236528396606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5973398685455322, + "epoch": 2.85, + "learning_rate": 3.576077768385461e-05, + "loss": 0.6389, + "step": 3369, + "task_loss": 0.20922210812568665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.735552191734314, + "epoch": 2.85, + "learning_rate": 3.5756551141166526e-05, + "loss": 0.8304, + "step": 3370, + "task_loss": 0.8373743891716003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6899963617324829, + "epoch": 2.85, + "learning_rate": 3.5752324598478446e-05, + "loss": 0.6597, + "step": 3371, + "task_loss": 0.9547471404075623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0091354846954346, + "epoch": 2.85, + "learning_rate": 3.5748098055790366e-05, + "loss": 0.8593, + "step": 3372, + "task_loss": 0.6888219118118286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5795108079910278, + "epoch": 2.85, + "learning_rate": 3.5743871513102286e-05, + "loss": 0.4736, + "step": 3373, + "task_loss": 0.6050034165382385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7279648184776306, + "epoch": 2.85, + "learning_rate": 3.5739644970414205e-05, + "loss": 0.7487, + "step": 3374, + "task_loss": 0.7485307455062866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7954450845718384, + "epoch": 2.85, + "learning_rate": 3.573541842772612e-05, + "loss": 0.6612, + "step": 3375, + "task_loss": 0.5137763619422913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5496881008148193, + "epoch": 2.85, + "learning_rate": 3.573119188503804e-05, + "loss": 0.709, + "step": 3376, + "task_loss": 1.0196874141693115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0830185413360596, + "epoch": 2.85, + "learning_rate": 3.572696534234996e-05, + "loss": 0.8297, + "step": 3377, + "task_loss": 0.9782475829124451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5371267795562744, + "epoch": 2.85, + "learning_rate": 3.572273879966188e-05, + "loss": 0.5906, + "step": 3378, + "task_loss": 0.5706921815872192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6014994382858276, + "epoch": 2.86, + "learning_rate": 3.57185122569738e-05, + "loss": 0.9173, + "step": 3379, + "task_loss": 0.9167914986610413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6044281721115112, + "epoch": 2.86, + "learning_rate": 3.571428571428572e-05, + "loss": 0.7661, + "step": 3380, + "task_loss": 0.6776968836784363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4123568534851074, + "epoch": 2.86, + "learning_rate": 3.571005917159764e-05, + "loss": 0.6166, + "step": 3381, + "task_loss": 0.21269316971302032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5059281587600708, + "epoch": 2.86, + "learning_rate": 3.570583262890955e-05, + "loss": 0.8724, + "step": 3382, + "task_loss": 0.5654011964797974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6949788331985474, + "epoch": 2.86, + "learning_rate": 3.5701606086221476e-05, + "loss": 0.7245, + "step": 3383, + "task_loss": 0.6039469838142395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1372690200805664, + "epoch": 2.86, + "learning_rate": 3.5697379543533396e-05, + "loss": 0.855, + "step": 3384, + "task_loss": 1.0438274145126343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5677127242088318, + "epoch": 2.86, + "learning_rate": 3.569315300084531e-05, + "loss": 0.8538, + "step": 3385, + "task_loss": 1.0312906503677368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7746845483779907, + "epoch": 2.86, + "learning_rate": 3.568892645815723e-05, + "loss": 0.759, + "step": 3386, + "task_loss": 0.7188562154769897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.209460735321045, + "epoch": 2.86, + "learning_rate": 3.568469991546915e-05, + "loss": 0.8257, + "step": 3387, + "task_loss": 1.9700309038162231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5082601308822632, + "epoch": 2.86, + "learning_rate": 3.568047337278107e-05, + "loss": 0.7571, + "step": 3388, + "task_loss": 0.8950508832931519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6700512170791626, + "epoch": 2.86, + "learning_rate": 3.567624683009299e-05, + "loss": 0.6723, + "step": 3389, + "task_loss": 0.5086400508880615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7176834344863892, + "epoch": 2.87, + "learning_rate": 3.567202028740491e-05, + "loss": 0.6147, + "step": 3390, + "task_loss": 0.8759731650352478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7018980979919434, + "epoch": 2.87, + "learning_rate": 3.566779374471682e-05, + "loss": 0.6998, + "step": 3391, + "task_loss": 1.050652265548706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7234511375427246, + "epoch": 2.87, + "learning_rate": 3.566356720202874e-05, + "loss": 0.8682, + "step": 3392, + "task_loss": 0.8712239265441895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.788881778717041, + "epoch": 2.87, + "learning_rate": 3.565934065934066e-05, + "loss": 0.7917, + "step": 3393, + "task_loss": 1.0008960962295532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8022782802581787, + "epoch": 2.87, + "learning_rate": 3.565511411665258e-05, + "loss": 0.596, + "step": 3394, + "task_loss": 0.4843157231807709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5188448429107666, + "epoch": 2.87, + "learning_rate": 3.56508875739645e-05, + "loss": 0.8625, + "step": 3395, + "task_loss": 0.46342137455940247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7050491571426392, + "epoch": 2.87, + "learning_rate": 3.564666103127642e-05, + "loss": 0.6855, + "step": 3396, + "task_loss": 0.6222397089004517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.710543692111969, + "epoch": 2.87, + "learning_rate": 3.564243448858834e-05, + "loss": 0.7413, + "step": 3397, + "task_loss": 0.44308194518089294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5758408904075623, + "epoch": 2.87, + "learning_rate": 3.563820794590025e-05, + "loss": 0.6583, + "step": 3398, + "task_loss": 0.49602827429771423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6584762334823608, + "epoch": 2.87, + "learning_rate": 3.563398140321217e-05, + "loss": 0.7451, + "step": 3399, + "task_loss": 1.189739465713501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.647689700126648, + "epoch": 2.87, + "learning_rate": 3.56297548605241e-05, + "loss": 0.6564, + "step": 3400, + "task_loss": 1.050286889076233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7365431785583496, + "epoch": 2.87, + "learning_rate": 3.562552831783601e-05, + "loss": 0.5397, + "step": 3401, + "task_loss": 0.9292466044425964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6733627319335938, + "epoch": 2.88, + "learning_rate": 3.562130177514793e-05, + "loss": 0.7334, + "step": 3402, + "task_loss": 0.638867199420929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7490792274475098, + "epoch": 2.88, + "learning_rate": 3.561707523245985e-05, + "loss": 0.8545, + "step": 3403, + "task_loss": 0.9101022481918335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7391672730445862, + "epoch": 2.88, + "learning_rate": 3.5612848689771764e-05, + "loss": 0.6606, + "step": 3404, + "task_loss": 1.180227518081665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5787700414657593, + "epoch": 2.88, + "learning_rate": 3.560862214708368e-05, + "loss": 0.6409, + "step": 3405, + "task_loss": 0.5392124652862549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.558342456817627, + "epoch": 2.88, + "learning_rate": 3.560439560439561e-05, + "loss": 0.6974, + "step": 3406, + "task_loss": 0.6289198398590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5504965782165527, + "epoch": 2.88, + "learning_rate": 3.560016906170752e-05, + "loss": 0.7247, + "step": 3407, + "task_loss": 0.9255884885787964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7997416257858276, + "epoch": 2.88, + "learning_rate": 3.559594251901944e-05, + "loss": 0.7936, + "step": 3408, + "task_loss": 0.4200798571109772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5677063465118408, + "epoch": 2.88, + "learning_rate": 3.559171597633136e-05, + "loss": 0.6632, + "step": 3409, + "task_loss": 0.49648523330688477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6962506771087646, + "epoch": 2.88, + "learning_rate": 3.558748943364328e-05, + "loss": 0.6746, + "step": 3410, + "task_loss": 0.602637529373169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46902984380722046, + "epoch": 2.88, + "learning_rate": 3.55832628909552e-05, + "loss": 0.6844, + "step": 3411, + "task_loss": 1.2651164531707764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.937312126159668, + "epoch": 2.88, + "learning_rate": 3.557903634826712e-05, + "loss": 0.6725, + "step": 3412, + "task_loss": 0.8039719462394714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6857679486274719, + "epoch": 2.88, + "learning_rate": 3.557480980557904e-05, + "loss": 0.6839, + "step": 3413, + "task_loss": 0.811154842376709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.514104962348938, + "epoch": 2.89, + "learning_rate": 3.5570583262890954e-05, + "loss": 0.7477, + "step": 3414, + "task_loss": 1.1416230201721191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5920051336288452, + "epoch": 2.89, + "learning_rate": 3.5566356720202874e-05, + "loss": 0.8937, + "step": 3415, + "task_loss": 1.3573713302612305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5068243145942688, + "epoch": 2.89, + "learning_rate": 3.5562130177514794e-05, + "loss": 0.715, + "step": 3416, + "task_loss": 0.8114786148071289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6290361881256104, + "epoch": 2.89, + "learning_rate": 3.5557903634826713e-05, + "loss": 0.8212, + "step": 3417, + "task_loss": 1.0252795219421387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9865380525588989, + "epoch": 2.89, + "learning_rate": 3.555367709213863e-05, + "loss": 0.8204, + "step": 3418, + "task_loss": 1.1151162385940552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6453908085823059, + "epoch": 2.89, + "learning_rate": 3.554945054945055e-05, + "loss": 0.6628, + "step": 3419, + "task_loss": 0.5817614197731018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1869101524353027, + "epoch": 2.89, + "learning_rate": 3.5545224006762466e-05, + "loss": 0.8956, + "step": 3420, + "task_loss": 1.7280217409133911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7250593900680542, + "epoch": 2.89, + "learning_rate": 3.5540997464074386e-05, + "loss": 0.7479, + "step": 3421, + "task_loss": 1.6045708656311035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9519643783569336, + "epoch": 2.89, + "learning_rate": 3.5536770921386305e-05, + "loss": 0.8224, + "step": 3422, + "task_loss": 1.6881781816482544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4207119941711426, + "epoch": 2.89, + "learning_rate": 3.553254437869823e-05, + "loss": 0.6449, + "step": 3423, + "task_loss": 0.5683922171592712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5113493800163269, + "epoch": 2.89, + "learning_rate": 3.5528317836010145e-05, + "loss": 0.7558, + "step": 3424, + "task_loss": 0.14405347406864166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9423756003379822, + "epoch": 2.89, + "learning_rate": 3.5524091293322065e-05, + "loss": 0.6044, + "step": 3425, + "task_loss": 1.99700927734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.783976674079895, + "epoch": 2.9, + "learning_rate": 3.5519864750633984e-05, + "loss": 0.8013, + "step": 3426, + "task_loss": 1.6793901920318604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.562309205532074, + "epoch": 2.9, + "learning_rate": 3.55156382079459e-05, + "loss": 0.7635, + "step": 3427, + "task_loss": 0.7161197066307068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7572007179260254, + "epoch": 2.9, + "learning_rate": 3.5511411665257824e-05, + "loss": 0.7213, + "step": 3428, + "task_loss": 0.5464855432510376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.651614785194397, + "epoch": 2.9, + "learning_rate": 3.5507185122569743e-05, + "loss": 0.6467, + "step": 3429, + "task_loss": 0.7751861214637756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8770782947540283, + "epoch": 2.9, + "learning_rate": 3.5502958579881656e-05, + "loss": 0.8095, + "step": 3430, + "task_loss": 0.8769362568855286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40966886281967163, + "epoch": 2.9, + "learning_rate": 3.5498732037193576e-05, + "loss": 0.4654, + "step": 3431, + "task_loss": 0.4285908639431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9747872352600098, + "epoch": 2.9, + "learning_rate": 3.5494505494505496e-05, + "loss": 0.7261, + "step": 3432, + "task_loss": 1.238671898841858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.706784725189209, + "epoch": 2.9, + "learning_rate": 3.5490278951817416e-05, + "loss": 0.8105, + "step": 3433, + "task_loss": 0.6514732837677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.041033387184143, + "epoch": 2.9, + "learning_rate": 3.5486052409129335e-05, + "loss": 0.6823, + "step": 3434, + "task_loss": 1.0296673774719238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6060007214546204, + "epoch": 2.9, + "learning_rate": 3.5481825866441255e-05, + "loss": 0.8608, + "step": 3435, + "task_loss": 1.479670524597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0092724561691284, + "epoch": 2.9, + "learning_rate": 3.547759932375317e-05, + "loss": 0.7246, + "step": 3436, + "task_loss": 1.5964360237121582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5766658782958984, + "epoch": 2.9, + "learning_rate": 3.547337278106509e-05, + "loss": 0.5895, + "step": 3437, + "task_loss": 0.821502685546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7535123229026794, + "epoch": 2.91, + "learning_rate": 3.546914623837701e-05, + "loss": 0.6715, + "step": 3438, + "task_loss": 0.3763297498226166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1241356134414673, + "epoch": 2.91, + "learning_rate": 3.546491969568893e-05, + "loss": 0.7766, + "step": 3439, + "task_loss": 1.713033676147461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6541032791137695, + "epoch": 2.91, + "learning_rate": 3.546069315300085e-05, + "loss": 0.6446, + "step": 3440, + "task_loss": 0.9213873147964478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6459155678749084, + "epoch": 2.91, + "learning_rate": 3.545646661031277e-05, + "loss": 0.8694, + "step": 3441, + "task_loss": 1.4849766492843628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6222901344299316, + "epoch": 2.91, + "learning_rate": 3.5452240067624687e-05, + "loss": 0.8048, + "step": 3442, + "task_loss": 0.6319260597229004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1128098964691162, + "epoch": 2.91, + "learning_rate": 3.54480135249366e-05, + "loss": 1.1003, + "step": 3443, + "task_loss": 0.625972330570221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.435336709022522, + "epoch": 2.91, + "learning_rate": 3.544378698224852e-05, + "loss": 0.5286, + "step": 3444, + "task_loss": 0.2884848713874817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4941648542881012, + "epoch": 2.91, + "learning_rate": 3.5439560439560446e-05, + "loss": 0.8175, + "step": 3445, + "task_loss": 1.3311123847961426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7279483079910278, + "epoch": 2.91, + "learning_rate": 3.543533389687236e-05, + "loss": 0.724, + "step": 3446, + "task_loss": 0.3616481423377991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9487766027450562, + "epoch": 2.91, + "learning_rate": 3.543110735418428e-05, + "loss": 0.7124, + "step": 3447, + "task_loss": 1.3311735391616821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9165492057800293, + "epoch": 2.91, + "learning_rate": 3.54268808114962e-05, + "loss": 0.8024, + "step": 3448, + "task_loss": 1.0065410137176514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0618176460266113, + "epoch": 2.91, + "learning_rate": 3.542265426880811e-05, + "loss": 0.8527, + "step": 3449, + "task_loss": 1.2488597631454468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.09998619556427, + "epoch": 2.92, + "learning_rate": 3.541842772612004e-05, + "loss": 0.8213, + "step": 3450, + "task_loss": 0.708964467048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7216796278953552, + "epoch": 2.92, + "learning_rate": 3.541420118343196e-05, + "loss": 0.851, + "step": 3451, + "task_loss": 1.4598872661590576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7261360883712769, + "epoch": 2.92, + "learning_rate": 3.540997464074388e-05, + "loss": 0.7679, + "step": 3452, + "task_loss": 0.5757009387016296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3547706604003906, + "epoch": 2.92, + "learning_rate": 3.540574809805579e-05, + "loss": 0.6047, + "step": 3453, + "task_loss": 0.6476296782493591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8629271388053894, + "epoch": 2.92, + "learning_rate": 3.540152155536771e-05, + "loss": 0.8117, + "step": 3454, + "task_loss": 0.8801674246788025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43172964453697205, + "epoch": 2.92, + "learning_rate": 3.539729501267963e-05, + "loss": 0.5993, + "step": 3455, + "task_loss": 2.2454917430877686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5369665026664734, + "epoch": 2.92, + "learning_rate": 3.539306846999155e-05, + "loss": 0.6634, + "step": 3456, + "task_loss": 1.1113795042037964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7071578502655029, + "epoch": 2.92, + "learning_rate": 3.538884192730347e-05, + "loss": 0.7327, + "step": 3457, + "task_loss": 0.6536293029785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.934321403503418, + "epoch": 2.92, + "learning_rate": 3.538461538461539e-05, + "loss": 0.7208, + "step": 3458, + "task_loss": 0.9927128553390503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8764635920524597, + "epoch": 2.92, + "learning_rate": 3.53803888419273e-05, + "loss": 0.6167, + "step": 3459, + "task_loss": 0.6740485429763794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9551503658294678, + "epoch": 2.92, + "learning_rate": 3.537616229923922e-05, + "loss": 0.779, + "step": 3460, + "task_loss": 0.9154787063598633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6708633899688721, + "epoch": 2.93, + "learning_rate": 3.537193575655114e-05, + "loss": 0.5018, + "step": 3461, + "task_loss": 0.23266105353832245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42945969104766846, + "epoch": 2.93, + "learning_rate": 3.536770921386306e-05, + "loss": 0.5971, + "step": 3462, + "task_loss": 1.6702088117599487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5405272841453552, + "epoch": 2.93, + "learning_rate": 3.536348267117498e-05, + "loss": 0.7712, + "step": 3463, + "task_loss": 1.674409031867981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0547758340835571, + "epoch": 2.93, + "learning_rate": 3.53592561284869e-05, + "loss": 0.7497, + "step": 3464, + "task_loss": 0.7672004699707031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8756707906723022, + "epoch": 2.93, + "learning_rate": 3.5355029585798813e-05, + "loss": 0.6427, + "step": 3465, + "task_loss": 0.7010441422462463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6829769611358643, + "epoch": 2.93, + "learning_rate": 3.535080304311073e-05, + "loss": 0.7786, + "step": 3466, + "task_loss": 0.35273051261901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3636772036552429, + "epoch": 2.93, + "learning_rate": 3.534657650042266e-05, + "loss": 0.6732, + "step": 3467, + "task_loss": 0.5016867518424988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.559635579586029, + "epoch": 2.93, + "learning_rate": 3.534234995773458e-05, + "loss": 0.5576, + "step": 3468, + "task_loss": 0.3644520938396454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7553682923316956, + "epoch": 2.93, + "learning_rate": 3.533812341504649e-05, + "loss": 0.7665, + "step": 3469, + "task_loss": 1.2647511959075928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.55854332447052, + "epoch": 2.93, + "learning_rate": 3.533389687235841e-05, + "loss": 0.8079, + "step": 3470, + "task_loss": 1.339534044265747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6029951572418213, + "epoch": 2.93, + "learning_rate": 3.532967032967033e-05, + "loss": 0.7349, + "step": 3471, + "task_loss": 0.6847051978111267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4803905785083771, + "epoch": 2.93, + "learning_rate": 3.532544378698225e-05, + "loss": 0.7698, + "step": 3472, + "task_loss": 0.39300790429115295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6557621359825134, + "epoch": 2.94, + "learning_rate": 3.532121724429417e-05, + "loss": 0.8, + "step": 3473, + "task_loss": 0.689056932926178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38581717014312744, + "epoch": 2.94, + "learning_rate": 3.531699070160609e-05, + "loss": 0.6589, + "step": 3474, + "task_loss": 1.1494646072387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7917265892028809, + "epoch": 2.94, + "learning_rate": 3.5312764158918004e-05, + "loss": 0.8605, + "step": 3475, + "task_loss": 0.4792248010635376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4674330949783325, + "epoch": 2.94, + "learning_rate": 3.5308537616229924e-05, + "loss": 0.6509, + "step": 3476, + "task_loss": 0.5007020235061646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7342607378959656, + "epoch": 2.94, + "learning_rate": 3.5304311073541844e-05, + "loss": 0.6254, + "step": 3477, + "task_loss": 0.6376438736915588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.569838285446167, + "epoch": 2.94, + "learning_rate": 3.530008453085376e-05, + "loss": 0.7327, + "step": 3478, + "task_loss": 0.7260635495185852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5506277084350586, + "epoch": 2.94, + "learning_rate": 3.529585798816568e-05, + "loss": 0.8038, + "step": 3479, + "task_loss": 1.100770354270935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0023845434188843, + "epoch": 2.94, + "learning_rate": 3.52916314454776e-05, + "loss": 0.7105, + "step": 3480, + "task_loss": 1.0005943775177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.136847734451294, + "epoch": 2.94, + "learning_rate": 3.528740490278952e-05, + "loss": 0.9255, + "step": 3481, + "task_loss": 1.7830514907836914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.02586030960083, + "epoch": 2.94, + "learning_rate": 3.5283178360101435e-05, + "loss": 0.8018, + "step": 3482, + "task_loss": 0.7019370794296265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47710490226745605, + "epoch": 2.94, + "learning_rate": 3.5278951817413355e-05, + "loss": 0.5686, + "step": 3483, + "task_loss": 0.22217045724391937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8275421857833862, + "epoch": 2.94, + "learning_rate": 3.527472527472528e-05, + "loss": 0.7999, + "step": 3484, + "task_loss": 1.4307249784469604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2308499813079834, + "epoch": 2.95, + "learning_rate": 3.5270498732037195e-05, + "loss": 0.84, + "step": 3485, + "task_loss": 1.6488863229751587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5758970975875854, + "epoch": 2.95, + "learning_rate": 3.5266272189349114e-05, + "loss": 0.7943, + "step": 3486, + "task_loss": 0.3984098434448242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9738062620162964, + "epoch": 2.95, + "learning_rate": 3.5262045646661034e-05, + "loss": 0.8309, + "step": 3487, + "task_loss": 1.131583333015442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6573241353034973, + "epoch": 2.95, + "learning_rate": 3.525781910397295e-05, + "loss": 0.6877, + "step": 3488, + "task_loss": 0.5549317002296448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0458537340164185, + "epoch": 2.95, + "learning_rate": 3.5253592561284874e-05, + "loss": 0.9446, + "step": 3489, + "task_loss": 1.9252161979675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6736923456192017, + "epoch": 2.95, + "learning_rate": 3.524936601859679e-05, + "loss": 0.4841, + "step": 3490, + "task_loss": 0.46683552861213684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.806762158870697, + "epoch": 2.95, + "learning_rate": 3.5245139475908706e-05, + "loss": 0.734, + "step": 3491, + "task_loss": 0.7796412706375122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4483540654182434, + "epoch": 2.95, + "learning_rate": 3.5240912933220626e-05, + "loss": 0.6219, + "step": 3492, + "task_loss": 0.4687820076942444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39906173944473267, + "epoch": 2.95, + "learning_rate": 3.5236686390532546e-05, + "loss": 0.6527, + "step": 3493, + "task_loss": 0.1871040165424347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7199836373329163, + "epoch": 2.95, + "learning_rate": 3.5232459847844466e-05, + "loss": 0.9041, + "step": 3494, + "task_loss": 0.9986658692359924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5101284384727478, + "epoch": 2.95, + "learning_rate": 3.5228233305156385e-05, + "loss": 0.5592, + "step": 3495, + "task_loss": 0.9976024031639099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7561028003692627, + "epoch": 2.95, + "learning_rate": 3.5224006762468305e-05, + "loss": 0.7077, + "step": 3496, + "task_loss": 1.2883236408233643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0115514993667603, + "epoch": 2.96, + "learning_rate": 3.5219780219780225e-05, + "loss": 0.7671, + "step": 3497, + "task_loss": 1.0213358402252197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6214433908462524, + "epoch": 2.96, + "learning_rate": 3.521555367709214e-05, + "loss": 0.7222, + "step": 3498, + "task_loss": 1.0607128143310547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8613982796669006, + "epoch": 2.96, + "learning_rate": 3.521132713440406e-05, + "loss": 0.8158, + "step": 3499, + "task_loss": 0.6398619413375854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8451910614967346, + "epoch": 2.96, + "learning_rate": 3.520710059171598e-05, + "loss": 0.5851, + "step": 3500, + "task_loss": 1.3154056072235107 + }, + { + "epoch": 2.96, + "eval_accuracy": 0.8891881188118812, + "eval_loss": 0.45988893508911133, + "eval_runtime": 228.6581, + "eval_samples_per_second": 110.427, + "eval_steps_per_second": 0.866, + "step": 3500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7575498223304749, + "epoch": 2.96, + "learning_rate": 3.52028740490279e-05, + "loss": 0.6651, + "step": 3501, + "task_loss": 1.1811325550079346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6149915456771851, + "epoch": 2.96, + "learning_rate": 3.519864750633982e-05, + "loss": 0.7076, + "step": 3502, + "task_loss": 0.3247385025024414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7450119853019714, + "epoch": 2.96, + "learning_rate": 3.5194420963651736e-05, + "loss": 0.5744, + "step": 3503, + "task_loss": 0.24779245257377625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2241921424865723, + "epoch": 2.96, + "learning_rate": 3.519019442096365e-05, + "loss": 0.8492, + "step": 3504, + "task_loss": 2.1540780067443848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4457588791847229, + "epoch": 2.96, + "learning_rate": 3.518596787827557e-05, + "loss": 0.5055, + "step": 3505, + "task_loss": 1.8695106506347656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1786446571350098, + "epoch": 2.96, + "learning_rate": 3.5181741335587496e-05, + "loss": 0.7534, + "step": 3506, + "task_loss": 1.5412825345993042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.611754298210144, + "epoch": 2.96, + "learning_rate": 3.517751479289941e-05, + "loss": 0.8446, + "step": 3507, + "task_loss": 0.8767352104187012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.6720008850097656, + "epoch": 2.96, + "learning_rate": 3.517328825021133e-05, + "loss": 0.9244, + "step": 3508, + "task_loss": 0.9086243510246277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.78460693359375, + "epoch": 2.97, + "learning_rate": 3.516906170752325e-05, + "loss": 0.6703, + "step": 3509, + "task_loss": 1.3038012981414795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.115155577659607, + "epoch": 2.97, + "learning_rate": 3.516483516483517e-05, + "loss": 0.8794, + "step": 3510, + "task_loss": 2.5163519382476807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5943164229393005, + "epoch": 2.97, + "learning_rate": 3.516060862214709e-05, + "loss": 1.1491, + "step": 3511, + "task_loss": 1.702355146408081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6583766937255859, + "epoch": 2.97, + "learning_rate": 3.515638207945901e-05, + "loss": 0.7126, + "step": 3512, + "task_loss": 0.9775614738464355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5614702701568604, + "epoch": 2.97, + "learning_rate": 3.515215553677093e-05, + "loss": 0.6494, + "step": 3513, + "task_loss": 1.4156602621078491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5537071228027344, + "epoch": 2.97, + "learning_rate": 3.514792899408284e-05, + "loss": 0.643, + "step": 3514, + "task_loss": 0.24175675213336945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48728644847869873, + "epoch": 2.97, + "learning_rate": 3.514370245139476e-05, + "loss": 0.8004, + "step": 3515, + "task_loss": 0.9385992288589478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7581866383552551, + "epoch": 2.97, + "learning_rate": 3.513947590870668e-05, + "loss": 0.6006, + "step": 3516, + "task_loss": 0.6312645673751831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6775551438331604, + "epoch": 2.97, + "learning_rate": 3.51352493660186e-05, + "loss": 0.611, + "step": 3517, + "task_loss": 0.7012837529182434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6819434762001038, + "epoch": 2.97, + "learning_rate": 3.513102282333052e-05, + "loss": 0.6328, + "step": 3518, + "task_loss": 0.567051112651825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8982431888580322, + "epoch": 2.97, + "learning_rate": 3.512679628064244e-05, + "loss": 0.8216, + "step": 3519, + "task_loss": 1.0541644096374512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44036123156547546, + "epoch": 2.97, + "learning_rate": 3.512256973795435e-05, + "loss": 0.531, + "step": 3520, + "task_loss": 0.36266809701919556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0773639678955078, + "epoch": 2.98, + "learning_rate": 3.511834319526627e-05, + "loss": 0.6669, + "step": 3521, + "task_loss": 1.2040399312973022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6456725001335144, + "epoch": 2.98, + "learning_rate": 3.511411665257819e-05, + "loss": 0.79, + "step": 3522, + "task_loss": 0.3172779083251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5409705638885498, + "epoch": 2.98, + "learning_rate": 3.510989010989011e-05, + "loss": 0.7878, + "step": 3523, + "task_loss": 0.7052796483039856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9127213954925537, + "epoch": 2.98, + "learning_rate": 3.510566356720203e-05, + "loss": 0.7125, + "step": 3524, + "task_loss": 0.7552744150161743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1011497974395752, + "epoch": 2.98, + "learning_rate": 3.510143702451395e-05, + "loss": 0.7777, + "step": 3525, + "task_loss": 0.5825923085212708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6759148240089417, + "epoch": 2.98, + "learning_rate": 3.509721048182587e-05, + "loss": 0.8311, + "step": 3526, + "task_loss": 0.709834098815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7002780437469482, + "epoch": 2.98, + "learning_rate": 3.509298393913778e-05, + "loss": 0.7319, + "step": 3527, + "task_loss": 1.1596548557281494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46024376153945923, + "epoch": 2.98, + "learning_rate": 3.508875739644971e-05, + "loss": 0.7047, + "step": 3528, + "task_loss": 0.6764599680900574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.8885741233825684, + "epoch": 2.98, + "learning_rate": 3.508453085376163e-05, + "loss": 0.8634, + "step": 3529, + "task_loss": 1.3570809364318848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7255014181137085, + "epoch": 2.98, + "learning_rate": 3.508030431107354e-05, + "loss": 0.7611, + "step": 3530, + "task_loss": 2.0896525382995605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5057455897331238, + "epoch": 2.98, + "learning_rate": 3.507607776838546e-05, + "loss": 0.8551, + "step": 3531, + "task_loss": 0.3309262692928314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45254671573638916, + "epoch": 2.99, + "learning_rate": 3.507185122569738e-05, + "loss": 0.7097, + "step": 3532, + "task_loss": 0.39333784580230713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8713363409042358, + "epoch": 2.99, + "learning_rate": 3.5067624683009295e-05, + "loss": 0.8156, + "step": 3533, + "task_loss": 1.6795499324798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.60808265209198, + "epoch": 2.99, + "learning_rate": 3.506339814032122e-05, + "loss": 0.5878, + "step": 3534, + "task_loss": 0.8184134364128113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8242766857147217, + "epoch": 2.99, + "learning_rate": 3.505917159763314e-05, + "loss": 0.7817, + "step": 3535, + "task_loss": 0.9278186559677124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5657252073287964, + "epoch": 2.99, + "learning_rate": 3.5054945054945054e-05, + "loss": 0.7697, + "step": 3536, + "task_loss": 0.5133640766143799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6599215865135193, + "epoch": 2.99, + "learning_rate": 3.5050718512256974e-05, + "loss": 0.5593, + "step": 3537, + "task_loss": 1.3066964149475098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37537139654159546, + "epoch": 2.99, + "learning_rate": 3.504649196956889e-05, + "loss": 0.6747, + "step": 3538, + "task_loss": 0.8508651852607727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6105172038078308, + "epoch": 2.99, + "learning_rate": 3.504226542688081e-05, + "loss": 0.676, + "step": 3539, + "task_loss": 1.255632996559143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47263914346694946, + "epoch": 2.99, + "learning_rate": 3.503803888419273e-05, + "loss": 0.7332, + "step": 3540, + "task_loss": 1.306686520576477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6724578142166138, + "epoch": 2.99, + "learning_rate": 3.503381234150465e-05, + "loss": 0.5942, + "step": 3541, + "task_loss": 0.9191336035728455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6361077427864075, + "epoch": 2.99, + "learning_rate": 3.502958579881657e-05, + "loss": 0.6647, + "step": 3542, + "task_loss": 0.6519774198532104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6790368556976318, + "epoch": 2.99, + "learning_rate": 3.5025359256128485e-05, + "loss": 0.6108, + "step": 3543, + "task_loss": 1.3175681829452515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39566344022750854, + "epoch": 3.0, + "learning_rate": 3.5021132713440405e-05, + "loss": 0.8359, + "step": 3544, + "task_loss": 0.7916802763938904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4797672629356384, + "epoch": 3.0, + "learning_rate": 3.501690617075233e-05, + "loss": 0.6771, + "step": 3545, + "task_loss": 0.8603438138961792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1398463249206543, + "epoch": 3.0, + "learning_rate": 3.5012679628064244e-05, + "loss": 0.8374, + "step": 3546, + "task_loss": 0.814414381980896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6054514050483704, + "epoch": 3.0, + "learning_rate": 3.5008453085376164e-05, + "loss": 0.6187, + "step": 3547, + "task_loss": 0.8762628436088562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5500036478042603, + "epoch": 3.0, + "learning_rate": 3.5004226542688084e-05, + "loss": 0.4909, + "step": 3548, + "task_loss": 0.31093916296958923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8774495124816895, + "epoch": 3.0, + "learning_rate": 3.5e-05, + "loss": 0.644, + "step": 3549, + "task_loss": 0.39253199100494385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43953657150268555, + "epoch": 3.0, + "learning_rate": 3.499577345731192e-05, + "loss": 1.3566, + "step": 3550, + "task_loss": 0.8424968123435974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4337252974510193, + "epoch": 3.0, + "learning_rate": 3.499154691462384e-05, + "loss": 0.4779, + "step": 3551, + "task_loss": 0.44218409061431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8042478561401367, + "epoch": 3.0, + "learning_rate": 3.4987320371935756e-05, + "loss": 0.6824, + "step": 3552, + "task_loss": 0.3617814779281616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6113454103469849, + "epoch": 3.0, + "learning_rate": 3.4983093829247676e-05, + "loss": 0.511, + "step": 3553, + "task_loss": 1.1056156158447266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8372128009796143, + "epoch": 3.0, + "learning_rate": 3.4978867286559596e-05, + "loss": 0.6523, + "step": 3554, + "task_loss": 0.5932086110115051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29487931728363037, + "epoch": 3.01, + "learning_rate": 3.4974640743871515e-05, + "loss": 0.6849, + "step": 3555, + "task_loss": 0.6693393588066101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.965597927570343, + "epoch": 3.01, + "learning_rate": 3.4970414201183435e-05, + "loss": 0.6847, + "step": 3556, + "task_loss": 0.8072828650474548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3738608956336975, + "epoch": 3.01, + "learning_rate": 3.4966187658495355e-05, + "loss": 0.5951, + "step": 3557, + "task_loss": 0.30251047015190125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6709148287773132, + "epoch": 3.01, + "learning_rate": 3.4961961115807275e-05, + "loss": 0.701, + "step": 3558, + "task_loss": 0.7460740208625793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5601727962493896, + "epoch": 3.01, + "learning_rate": 3.495773457311919e-05, + "loss": 0.741, + "step": 3559, + "task_loss": 0.7785150408744812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8178591132164001, + "epoch": 3.01, + "learning_rate": 3.495350803043111e-05, + "loss": 0.6913, + "step": 3560, + "task_loss": 1.414636492729187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8537206053733826, + "epoch": 3.01, + "learning_rate": 3.494928148774303e-05, + "loss": 0.7244, + "step": 3561, + "task_loss": 1.7884440422058105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6941735744476318, + "epoch": 3.01, + "learning_rate": 3.494505494505495e-05, + "loss": 0.7028, + "step": 3562, + "task_loss": 0.658213198184967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5836626291275024, + "epoch": 3.01, + "learning_rate": 3.4940828402366866e-05, + "loss": 0.8885, + "step": 3563, + "task_loss": 0.5522605180740356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.891832709312439, + "epoch": 3.01, + "learning_rate": 3.4936601859678786e-05, + "loss": 0.7207, + "step": 3564, + "task_loss": 0.9203711748123169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3659219741821289, + "epoch": 3.01, + "learning_rate": 3.49323753169907e-05, + "loss": 0.6811, + "step": 3565, + "task_loss": 0.881561279296875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0173832178115845, + "epoch": 3.01, + "learning_rate": 3.492814877430262e-05, + "loss": 0.5866, + "step": 3566, + "task_loss": 0.4808333218097687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1727278232574463, + "epoch": 3.02, + "learning_rate": 3.492392223161454e-05, + "loss": 0.9874, + "step": 3567, + "task_loss": 1.234318733215332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6782771348953247, + "epoch": 3.02, + "learning_rate": 3.4919695688926465e-05, + "loss": 0.6266, + "step": 3568, + "task_loss": 1.586388349533081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7511957883834839, + "epoch": 3.02, + "learning_rate": 3.491546914623838e-05, + "loss": 0.7233, + "step": 3569, + "task_loss": 0.6499571800231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1829522848129272, + "epoch": 3.02, + "learning_rate": 3.49112426035503e-05, + "loss": 0.8081, + "step": 3570, + "task_loss": 1.551069736480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4098557233810425, + "epoch": 3.02, + "learning_rate": 3.490701606086222e-05, + "loss": 0.7652, + "step": 3571, + "task_loss": 0.5927712321281433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9034950733184814, + "epoch": 3.02, + "learning_rate": 3.490278951817413e-05, + "loss": 1.049, + "step": 3572, + "task_loss": 0.8563557267189026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5900263786315918, + "epoch": 3.02, + "learning_rate": 3.489856297548606e-05, + "loss": 0.6691, + "step": 3573, + "task_loss": 1.4217010736465454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41364336013793945, + "epoch": 3.02, + "learning_rate": 3.489433643279798e-05, + "loss": 0.5466, + "step": 3574, + "task_loss": 0.5450477004051208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6399833559989929, + "epoch": 3.02, + "learning_rate": 3.489010989010989e-05, + "loss": 0.7892, + "step": 3575, + "task_loss": 0.9992542862892151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7601376175880432, + "epoch": 3.02, + "learning_rate": 3.488588334742181e-05, + "loss": 0.5612, + "step": 3576, + "task_loss": 0.8570849895477295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5140930414199829, + "epoch": 3.02, + "learning_rate": 3.488165680473373e-05, + "loss": 0.6433, + "step": 3577, + "task_loss": 0.5177265405654907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42960435152053833, + "epoch": 3.02, + "learning_rate": 3.487743026204565e-05, + "loss": 0.7144, + "step": 3578, + "task_loss": 0.3304111361503601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5891939401626587, + "epoch": 3.03, + "learning_rate": 3.487320371935757e-05, + "loss": 0.5223, + "step": 3579, + "task_loss": 0.6053823232650757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4822579622268677, + "epoch": 3.03, + "learning_rate": 3.486897717666949e-05, + "loss": 0.5621, + "step": 3580, + "task_loss": 0.4804011285305023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6441078782081604, + "epoch": 3.03, + "learning_rate": 3.48647506339814e-05, + "loss": 0.743, + "step": 3581, + "task_loss": 1.1420804262161255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5860291123390198, + "epoch": 3.03, + "learning_rate": 3.486052409129332e-05, + "loss": 0.6635, + "step": 3582, + "task_loss": 0.6237084269523621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.582332968711853, + "epoch": 3.03, + "learning_rate": 3.485629754860524e-05, + "loss": 0.7599, + "step": 3583, + "task_loss": 1.3454017639160156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8366926908493042, + "epoch": 3.03, + "learning_rate": 3.485207100591716e-05, + "loss": 0.5525, + "step": 3584, + "task_loss": 0.9461635947227478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44944024085998535, + "epoch": 3.03, + "learning_rate": 3.484784446322908e-05, + "loss": 0.7438, + "step": 3585, + "task_loss": 0.6866551637649536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7555097341537476, + "epoch": 3.03, + "learning_rate": 3.4843617920541e-05, + "loss": 0.5589, + "step": 3586, + "task_loss": 0.9157886505126953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4893421530723572, + "epoch": 3.03, + "learning_rate": 3.483939137785292e-05, + "loss": 0.7171, + "step": 3587, + "task_loss": 1.3617891073226929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4786592423915863, + "epoch": 3.03, + "learning_rate": 3.483516483516483e-05, + "loss": 0.6531, + "step": 3588, + "task_loss": 0.7157772779464722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44599664211273193, + "epoch": 3.03, + "learning_rate": 3.483093829247675e-05, + "loss": 0.4893, + "step": 3589, + "task_loss": 0.4589158296585083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0318453311920166, + "epoch": 3.03, + "learning_rate": 3.482671174978868e-05, + "loss": 0.7698, + "step": 3590, + "task_loss": 0.9067234396934509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0646498203277588, + "epoch": 3.04, + "learning_rate": 3.482248520710059e-05, + "loss": 0.8219, + "step": 3591, + "task_loss": 1.3337565660476685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8827232122421265, + "epoch": 3.04, + "learning_rate": 3.481825866441251e-05, + "loss": 0.6953, + "step": 3592, + "task_loss": 1.154910683631897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3647899627685547, + "epoch": 3.04, + "learning_rate": 3.481403212172443e-05, + "loss": 0.513, + "step": 3593, + "task_loss": 0.4109800159931183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5805737972259521, + "epoch": 3.04, + "learning_rate": 3.4809805579036345e-05, + "loss": 0.8607, + "step": 3594, + "task_loss": 1.0610694885253906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5662297606468201, + "epoch": 3.04, + "learning_rate": 3.480557903634827e-05, + "loss": 0.5418, + "step": 3595, + "task_loss": 0.47057056427001953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9658246040344238, + "epoch": 3.04, + "learning_rate": 3.480135249366019e-05, + "loss": 0.669, + "step": 3596, + "task_loss": 0.48805683851242065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.108781099319458, + "epoch": 3.04, + "learning_rate": 3.479712595097211e-05, + "loss": 0.7186, + "step": 3597, + "task_loss": 1.9384948015213013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5495907068252563, + "epoch": 3.04, + "learning_rate": 3.4792899408284023e-05, + "loss": 0.6172, + "step": 3598, + "task_loss": 1.0088746547698975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4112240672111511, + "epoch": 3.04, + "learning_rate": 3.478867286559594e-05, + "loss": 0.5191, + "step": 3599, + "task_loss": 0.6627725958824158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.058617115020752, + "epoch": 3.04, + "learning_rate": 3.478444632290786e-05, + "loss": 0.7691, + "step": 3600, + "task_loss": 0.6134690642356873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8195458650588989, + "epoch": 3.04, + "learning_rate": 3.478021978021978e-05, + "loss": 0.6174, + "step": 3601, + "task_loss": 2.2314844131469727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9551973342895508, + "epoch": 3.04, + "learning_rate": 3.47759932375317e-05, + "loss": 0.9318, + "step": 3602, + "task_loss": 2.072413921356201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6425678730010986, + "epoch": 3.05, + "learning_rate": 3.477176669484362e-05, + "loss": 0.5524, + "step": 3603, + "task_loss": 0.7999861836433411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.940825879573822, + "epoch": 3.05, + "learning_rate": 3.4767540152155535e-05, + "loss": 0.6936, + "step": 3604, + "task_loss": 0.6632443070411682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7223283648490906, + "epoch": 3.05, + "learning_rate": 3.4763313609467455e-05, + "loss": 0.552, + "step": 3605, + "task_loss": 0.20343735814094543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46318188309669495, + "epoch": 3.05, + "learning_rate": 3.4759087066779375e-05, + "loss": 0.5071, + "step": 3606, + "task_loss": 0.7007144093513489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9650071263313293, + "epoch": 3.05, + "learning_rate": 3.4754860524091294e-05, + "loss": 0.5177, + "step": 3607, + "task_loss": 0.4019029140472412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3849183320999146, + "epoch": 3.05, + "learning_rate": 3.4750633981403214e-05, + "loss": 0.9365, + "step": 3608, + "task_loss": 0.7168657779693604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6050105690956116, + "epoch": 3.05, + "learning_rate": 3.4746407438715134e-05, + "loss": 0.642, + "step": 3609, + "task_loss": 0.5087983012199402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43330299854278564, + "epoch": 3.05, + "learning_rate": 3.474218089602705e-05, + "loss": 0.5827, + "step": 3610, + "task_loss": 0.5415623784065247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5316822528839111, + "epoch": 3.05, + "learning_rate": 3.4737954353338967e-05, + "loss": 0.5948, + "step": 3611, + "task_loss": 0.6772130131721497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4631333649158478, + "epoch": 3.05, + "learning_rate": 3.473372781065089e-05, + "loss": 0.5518, + "step": 3612, + "task_loss": 0.5316349267959595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48041319847106934, + "epoch": 3.05, + "learning_rate": 3.472950126796281e-05, + "loss": 0.716, + "step": 3613, + "task_loss": 0.3687160313129425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9606984853744507, + "epoch": 3.05, + "learning_rate": 3.4725274725274726e-05, + "loss": 0.7077, + "step": 3614, + "task_loss": 0.9231090545654297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38540130853652954, + "epoch": 3.06, + "learning_rate": 3.4721048182586645e-05, + "loss": 0.7155, + "step": 3615, + "task_loss": 0.4642612040042877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5279570817947388, + "epoch": 3.06, + "learning_rate": 3.4716821639898565e-05, + "loss": 0.7943, + "step": 3616, + "task_loss": 0.2436733841896057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5405007004737854, + "epoch": 3.06, + "learning_rate": 3.4712595097210485e-05, + "loss": 0.695, + "step": 3617, + "task_loss": 0.8684276342391968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7234306931495667, + "epoch": 3.06, + "learning_rate": 3.4708368554522405e-05, + "loss": 0.6789, + "step": 3618, + "task_loss": 0.9345763325691223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5793839693069458, + "epoch": 3.06, + "learning_rate": 3.4704142011834324e-05, + "loss": 0.655, + "step": 3619, + "task_loss": 0.1378188282251358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48766395449638367, + "epoch": 3.06, + "learning_rate": 3.469991546914624e-05, + "loss": 0.7424, + "step": 3620, + "task_loss": 0.16840820014476776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9185991287231445, + "epoch": 3.06, + "learning_rate": 3.469568892645816e-05, + "loss": 0.885, + "step": 3621, + "task_loss": 0.7222850322723389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6478458642959595, + "epoch": 3.06, + "learning_rate": 3.469146238377008e-05, + "loss": 0.6751, + "step": 3622, + "task_loss": 0.9369020462036133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43672165274620056, + "epoch": 3.06, + "learning_rate": 3.4687235841081997e-05, + "loss": 0.5366, + "step": 3623, + "task_loss": 0.6579287648200989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42594438791275024, + "epoch": 3.06, + "learning_rate": 3.4683009298393916e-05, + "loss": 0.7697, + "step": 3624, + "task_loss": 0.7111186981201172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6135975122451782, + "epoch": 3.06, + "learning_rate": 3.4678782755705836e-05, + "loss": 0.5485, + "step": 3625, + "task_loss": 0.9757449626922607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9196648001670837, + "epoch": 3.07, + "learning_rate": 3.467455621301775e-05, + "loss": 0.6215, + "step": 3626, + "task_loss": 1.1645159721374512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5087860822677612, + "epoch": 3.07, + "learning_rate": 3.467032967032967e-05, + "loss": 0.5705, + "step": 3627, + "task_loss": 0.4755611717700958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1587384939193726, + "epoch": 3.07, + "learning_rate": 3.466610312764159e-05, + "loss": 0.8849, + "step": 3628, + "task_loss": 0.7653050422668457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4880496859550476, + "epoch": 3.07, + "learning_rate": 3.4661876584953515e-05, + "loss": 0.5321, + "step": 3629, + "task_loss": 0.6090231537818909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1166832447052002, + "epoch": 3.07, + "learning_rate": 3.465765004226543e-05, + "loss": 0.6751, + "step": 3630, + "task_loss": 0.778313159942627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8939477801322937, + "epoch": 3.07, + "learning_rate": 3.465342349957735e-05, + "loss": 0.6275, + "step": 3631, + "task_loss": 1.2108412981033325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8347808718681335, + "epoch": 3.07, + "learning_rate": 3.464919695688927e-05, + "loss": 0.7194, + "step": 3632, + "task_loss": 0.6223213076591492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7574350833892822, + "epoch": 3.07, + "learning_rate": 3.464497041420118e-05, + "loss": 0.8716, + "step": 3633, + "task_loss": 1.2905359268188477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6879938840866089, + "epoch": 3.07, + "learning_rate": 3.464074387151311e-05, + "loss": 0.6335, + "step": 3634, + "task_loss": 0.3570175766944885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3733072578907013, + "epoch": 3.07, + "learning_rate": 3.463651732882503e-05, + "loss": 0.721, + "step": 3635, + "task_loss": 0.4897845983505249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6140908598899841, + "epoch": 3.07, + "learning_rate": 3.463229078613694e-05, + "loss": 0.61, + "step": 3636, + "task_loss": 0.3571910262107849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5097900629043579, + "epoch": 3.07, + "learning_rate": 3.462806424344886e-05, + "loss": 0.7031, + "step": 3637, + "task_loss": 1.0409501791000366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5055532455444336, + "epoch": 3.08, + "learning_rate": 3.462383770076078e-05, + "loss": 0.6336, + "step": 3638, + "task_loss": 0.7054551243782043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6209069490432739, + "epoch": 3.08, + "learning_rate": 3.46196111580727e-05, + "loss": 0.5907, + "step": 3639, + "task_loss": 0.24482828378677368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7833565473556519, + "epoch": 3.08, + "learning_rate": 3.461538461538462e-05, + "loss": 0.8233, + "step": 3640, + "task_loss": 1.542444109916687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5171521902084351, + "epoch": 3.08, + "learning_rate": 3.461115807269654e-05, + "loss": 0.6699, + "step": 3641, + "task_loss": 0.10688822716474533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7827117443084717, + "epoch": 3.08, + "learning_rate": 3.460693153000846e-05, + "loss": 0.6743, + "step": 3642, + "task_loss": 1.0264723300933838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3677600026130676, + "epoch": 3.08, + "learning_rate": 3.460270498732037e-05, + "loss": 0.6553, + "step": 3643, + "task_loss": 0.49530482292175293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4029976725578308, + "epoch": 3.08, + "learning_rate": 3.459847844463229e-05, + "loss": 0.5039, + "step": 3644, + "task_loss": 0.7688202261924744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5486735105514526, + "epoch": 3.08, + "learning_rate": 3.459425190194421e-05, + "loss": 0.4963, + "step": 3645, + "task_loss": 0.11027728021144867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3741183280944824, + "epoch": 3.08, + "learning_rate": 3.459002535925613e-05, + "loss": 0.5792, + "step": 3646, + "task_loss": 0.5936026573181152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6782668828964233, + "epoch": 3.08, + "learning_rate": 3.458579881656805e-05, + "loss": 0.6633, + "step": 3647, + "task_loss": 0.6288702487945557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4231131076812744, + "epoch": 3.08, + "learning_rate": 3.458157227387997e-05, + "loss": 0.8048, + "step": 3648, + "task_loss": 0.930057168006897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6348741054534912, + "epoch": 3.08, + "learning_rate": 3.457734573119188e-05, + "loss": 0.6539, + "step": 3649, + "task_loss": 0.40459612011909485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5477964878082275, + "epoch": 3.09, + "learning_rate": 3.45731191885038e-05, + "loss": 0.643, + "step": 3650, + "task_loss": 1.3185477256774902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46042338013648987, + "epoch": 3.09, + "learning_rate": 3.456889264581573e-05, + "loss": 0.7995, + "step": 3651, + "task_loss": 0.8438501358032227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8068141341209412, + "epoch": 3.09, + "learning_rate": 3.456466610312764e-05, + "loss": 0.7769, + "step": 3652, + "task_loss": 0.9261831045150757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.705544114112854, + "epoch": 3.09, + "learning_rate": 3.456043956043956e-05, + "loss": 0.643, + "step": 3653, + "task_loss": 0.6667105555534363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.283825397491455, + "epoch": 3.09, + "learning_rate": 3.455621301775148e-05, + "loss": 0.8295, + "step": 3654, + "task_loss": 1.1561167240142822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5110570192337036, + "epoch": 3.09, + "learning_rate": 3.4551986475063394e-05, + "loss": 0.6884, + "step": 3655, + "task_loss": 1.1525349617004395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9352744817733765, + "epoch": 3.09, + "learning_rate": 3.454775993237532e-05, + "loss": 0.6429, + "step": 3656, + "task_loss": 1.1322050094604492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8701868057250977, + "epoch": 3.09, + "learning_rate": 3.454353338968724e-05, + "loss": 0.9323, + "step": 3657, + "task_loss": 0.6373240351676941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1272753477096558, + "epoch": 3.09, + "learning_rate": 3.453930684699916e-05, + "loss": 0.9596, + "step": 3658, + "task_loss": 1.3707433938980103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7567120790481567, + "epoch": 3.09, + "learning_rate": 3.453508030431107e-05, + "loss": 0.5409, + "step": 3659, + "task_loss": 0.6953011751174927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.862185001373291, + "epoch": 3.09, + "learning_rate": 3.453085376162299e-05, + "loss": 0.5595, + "step": 3660, + "task_loss": 0.5445569157600403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7042524814605713, + "epoch": 3.09, + "learning_rate": 3.452662721893491e-05, + "loss": 0.6456, + "step": 3661, + "task_loss": 1.1954067945480347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4417010545730591, + "epoch": 3.1, + "learning_rate": 3.452240067624683e-05, + "loss": 0.6821, + "step": 3662, + "task_loss": 0.9332767724990845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31428661942481995, + "epoch": 3.1, + "learning_rate": 3.451817413355875e-05, + "loss": 0.5747, + "step": 3663, + "task_loss": 1.1859930753707886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7423601746559143, + "epoch": 3.1, + "learning_rate": 3.451394759087067e-05, + "loss": 0.8787, + "step": 3664, + "task_loss": 1.6809523105621338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8902570605278015, + "epoch": 3.1, + "learning_rate": 3.4509721048182585e-05, + "loss": 0.7193, + "step": 3665, + "task_loss": 0.8307445645332336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4651380181312561, + "epoch": 3.1, + "learning_rate": 3.4505494505494505e-05, + "loss": 0.5203, + "step": 3666, + "task_loss": 0.2219117283821106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6209709644317627, + "epoch": 3.1, + "learning_rate": 3.4501267962806424e-05, + "loss": 0.6962, + "step": 3667, + "task_loss": 0.3096996545791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4959348142147064, + "epoch": 3.1, + "learning_rate": 3.4497041420118344e-05, + "loss": 0.6595, + "step": 3668, + "task_loss": 0.23782256245613098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6249418258666992, + "epoch": 3.1, + "learning_rate": 3.4492814877430264e-05, + "loss": 0.5658, + "step": 3669, + "task_loss": 0.6621933579444885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2890488803386688, + "epoch": 3.1, + "learning_rate": 3.4488588334742184e-05, + "loss": 0.5579, + "step": 3670, + "task_loss": 0.8235982656478882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41222092509269714, + "epoch": 3.1, + "learning_rate": 3.4484361792054103e-05, + "loss": 0.4933, + "step": 3671, + "task_loss": 0.6372846961021423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47301530838012695, + "epoch": 3.1, + "learning_rate": 3.4480135249366016e-05, + "loss": 0.4614, + "step": 3672, + "task_loss": 0.31961125135421753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2152414321899414, + "epoch": 3.1, + "learning_rate": 3.447590870667794e-05, + "loss": 0.8295, + "step": 3673, + "task_loss": 0.8630499243736267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3054693937301636, + "epoch": 3.11, + "learning_rate": 3.447168216398986e-05, + "loss": 0.5383, + "step": 3674, + "task_loss": 0.2641279697418213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8957634568214417, + "epoch": 3.11, + "learning_rate": 3.4467455621301776e-05, + "loss": 0.7628, + "step": 3675, + "task_loss": 0.5650660991668701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3680400848388672, + "epoch": 3.11, + "learning_rate": 3.4463229078613695e-05, + "loss": 0.645, + "step": 3676, + "task_loss": 0.500686764717102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.710966944694519, + "epoch": 3.11, + "learning_rate": 3.4459002535925615e-05, + "loss": 0.6517, + "step": 3677, + "task_loss": 0.3076692223548889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8600115180015564, + "epoch": 3.11, + "learning_rate": 3.445477599323753e-05, + "loss": 0.6647, + "step": 3678, + "task_loss": 1.2308040857315063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7716382741928101, + "epoch": 3.11, + "learning_rate": 3.4450549450549455e-05, + "loss": 0.8439, + "step": 3679, + "task_loss": 1.227615475654602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7783896327018738, + "epoch": 3.11, + "learning_rate": 3.4446322907861374e-05, + "loss": 0.5951, + "step": 3680, + "task_loss": 1.0431450605392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34108084440231323, + "epoch": 3.11, + "learning_rate": 3.444209636517329e-05, + "loss": 0.4639, + "step": 3681, + "task_loss": 0.2700452506542206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4224686026573181, + "epoch": 3.11, + "learning_rate": 3.443786982248521e-05, + "loss": 0.5596, + "step": 3682, + "task_loss": 0.5008390545845032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5877885222434998, + "epoch": 3.11, + "learning_rate": 3.443364327979713e-05, + "loss": 0.857, + "step": 3683, + "task_loss": 1.0885577201843262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37955549359321594, + "epoch": 3.11, + "learning_rate": 3.4429416737109046e-05, + "loss": 0.4394, + "step": 3684, + "task_loss": 0.3138817250728607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40056484937667847, + "epoch": 3.11, + "learning_rate": 3.4425190194420966e-05, + "loss": 0.5732, + "step": 3685, + "task_loss": 0.2762420177459717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46877509355545044, + "epoch": 3.12, + "learning_rate": 3.4420963651732886e-05, + "loss": 0.5995, + "step": 3686, + "task_loss": 0.6931828856468201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7388428449630737, + "epoch": 3.12, + "learning_rate": 3.4416737109044806e-05, + "loss": 0.6687, + "step": 3687, + "task_loss": 0.497245728969574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5269056558609009, + "epoch": 3.12, + "learning_rate": 3.441251056635672e-05, + "loss": 0.5859, + "step": 3688, + "task_loss": 0.15157519280910492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5469696521759033, + "epoch": 3.12, + "learning_rate": 3.440828402366864e-05, + "loss": 0.6409, + "step": 3689, + "task_loss": 0.6367945075035095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5809814929962158, + "epoch": 3.12, + "learning_rate": 3.4404057480980565e-05, + "loss": 0.7934, + "step": 3690, + "task_loss": 1.1469680070877075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7316672801971436, + "epoch": 3.12, + "learning_rate": 3.439983093829248e-05, + "loss": 0.5431, + "step": 3691, + "task_loss": 0.5061452984809875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6621488928794861, + "epoch": 3.12, + "learning_rate": 3.43956043956044e-05, + "loss": 0.4336, + "step": 3692, + "task_loss": 0.5856128931045532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1469172239303589, + "epoch": 3.12, + "learning_rate": 3.439137785291632e-05, + "loss": 0.7151, + "step": 3693, + "task_loss": 1.250531792640686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2513699233531952, + "epoch": 3.12, + "learning_rate": 3.438715131022823e-05, + "loss": 0.5464, + "step": 3694, + "task_loss": 0.03669927641749382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6313196420669556, + "epoch": 3.12, + "learning_rate": 3.438292476754015e-05, + "loss": 0.6615, + "step": 3695, + "task_loss": 1.24752676486969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9174197912216187, + "epoch": 3.12, + "learning_rate": 3.4378698224852077e-05, + "loss": 0.785, + "step": 3696, + "task_loss": 2.034080743789673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9295786023139954, + "epoch": 3.13, + "learning_rate": 3.437447168216399e-05, + "loss": 0.8877, + "step": 3697, + "task_loss": 2.1541543006896973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6636124849319458, + "epoch": 3.13, + "learning_rate": 3.437024513947591e-05, + "loss": 0.6729, + "step": 3698, + "task_loss": 0.8873884677886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46804749965667725, + "epoch": 3.13, + "learning_rate": 3.436601859678783e-05, + "loss": 0.6133, + "step": 3699, + "task_loss": 0.32078617811203003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8805177807807922, + "epoch": 3.13, + "learning_rate": 3.436179205409975e-05, + "loss": 0.7605, + "step": 3700, + "task_loss": 0.886267364025116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.103912115097046, + "epoch": 3.13, + "learning_rate": 3.435756551141167e-05, + "loss": 0.8799, + "step": 3701, + "task_loss": 0.9798417687416077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0157362222671509, + "epoch": 3.13, + "learning_rate": 3.435333896872359e-05, + "loss": 0.5984, + "step": 3702, + "task_loss": 0.5364814400672913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29741010069847107, + "epoch": 3.13, + "learning_rate": 3.434911242603551e-05, + "loss": 0.7944, + "step": 3703, + "task_loss": 0.3277398645877838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7587305307388306, + "epoch": 3.13, + "learning_rate": 3.434488588334742e-05, + "loss": 0.7329, + "step": 3704, + "task_loss": 0.7631685733795166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7421884536743164, + "epoch": 3.13, + "learning_rate": 3.434065934065934e-05, + "loss": 0.8099, + "step": 3705, + "task_loss": 1.821908950805664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0236493349075317, + "epoch": 3.13, + "learning_rate": 3.433643279797126e-05, + "loss": 0.7777, + "step": 3706, + "task_loss": 0.8544322848320007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6677559018135071, + "epoch": 3.13, + "learning_rate": 3.433220625528318e-05, + "loss": 0.5537, + "step": 3707, + "task_loss": 0.9479256272315979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8967137932777405, + "epoch": 3.13, + "learning_rate": 3.43279797125951e-05, + "loss": 0.7185, + "step": 3708, + "task_loss": 0.4673652648925781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0994935035705566, + "epoch": 3.14, + "learning_rate": 3.432375316990702e-05, + "loss": 0.7953, + "step": 3709, + "task_loss": 1.1799334287643433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7766702175140381, + "epoch": 3.14, + "learning_rate": 3.431952662721893e-05, + "loss": 0.6018, + "step": 3710, + "task_loss": 1.365644097328186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7631234526634216, + "epoch": 3.14, + "learning_rate": 3.431530008453085e-05, + "loss": 0.6444, + "step": 3711, + "task_loss": 2.0571107864379883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6685872673988342, + "epoch": 3.14, + "learning_rate": 3.431107354184277e-05, + "loss": 0.5958, + "step": 3712, + "task_loss": 0.348898321390152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5137240290641785, + "epoch": 3.14, + "learning_rate": 3.430684699915469e-05, + "loss": 0.7048, + "step": 3713, + "task_loss": 0.8146230578422546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5841662287712097, + "epoch": 3.14, + "learning_rate": 3.430262045646661e-05, + "loss": 0.581, + "step": 3714, + "task_loss": 0.3389625549316406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.501213788986206, + "epoch": 3.14, + "learning_rate": 3.429839391377853e-05, + "loss": 0.6678, + "step": 3715, + "task_loss": 0.2915074825286865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.702324390411377, + "epoch": 3.14, + "learning_rate": 3.429416737109045e-05, + "loss": 0.5891, + "step": 3716, + "task_loss": 0.2192486971616745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8353461027145386, + "epoch": 3.14, + "learning_rate": 3.4289940828402364e-05, + "loss": 0.8046, + "step": 3717, + "task_loss": 1.577061653137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9514777064323425, + "epoch": 3.14, + "learning_rate": 3.428571428571429e-05, + "loss": 0.787, + "step": 3718, + "task_loss": 1.2960538864135742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47294703125953674, + "epoch": 3.14, + "learning_rate": 3.428148774302621e-05, + "loss": 0.65, + "step": 3719, + "task_loss": 0.6880050301551819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41818463802337646, + "epoch": 3.14, + "learning_rate": 3.427726120033812e-05, + "loss": 0.5719, + "step": 3720, + "task_loss": 0.6273998022079468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.656136155128479, + "epoch": 3.15, + "learning_rate": 3.427303465765004e-05, + "loss": 0.4719, + "step": 3721, + "task_loss": 1.1080176830291748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4582267999649048, + "epoch": 3.15, + "learning_rate": 3.426880811496196e-05, + "loss": 0.7449, + "step": 3722, + "task_loss": 1.2070543766021729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6509822607040405, + "epoch": 3.15, + "learning_rate": 3.426458157227388e-05, + "loss": 0.5769, + "step": 3723, + "task_loss": 1.1445403099060059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8488290309906006, + "epoch": 3.15, + "learning_rate": 3.42603550295858e-05, + "loss": 0.7427, + "step": 3724, + "task_loss": 0.20830874145030975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8277235627174377, + "epoch": 3.15, + "learning_rate": 3.425612848689772e-05, + "loss": 0.8952, + "step": 3725, + "task_loss": 0.7268683314323425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9796358942985535, + "epoch": 3.15, + "learning_rate": 3.4251901944209635e-05, + "loss": 0.7004, + "step": 3726, + "task_loss": 0.7696370482444763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6433742046356201, + "epoch": 3.15, + "learning_rate": 3.4247675401521555e-05, + "loss": 0.6158, + "step": 3727, + "task_loss": 0.4722282588481903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5639978647232056, + "epoch": 3.15, + "learning_rate": 3.4243448858833474e-05, + "loss": 0.6768, + "step": 3728, + "task_loss": 0.49071016907691956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5041612386703491, + "epoch": 3.15, + "learning_rate": 3.4239222316145394e-05, + "loss": 0.5287, + "step": 3729, + "task_loss": 0.4251966178417206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4461835026741028, + "epoch": 3.15, + "learning_rate": 3.4234995773457314e-05, + "loss": 0.5905, + "step": 3730, + "task_loss": 0.896463930606842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0472036600112915, + "epoch": 3.15, + "learning_rate": 3.4230769230769234e-05, + "loss": 0.6551, + "step": 3731, + "task_loss": 0.7908580303192139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5743104219436646, + "epoch": 3.15, + "learning_rate": 3.422654268808115e-05, + "loss": 0.6424, + "step": 3732, + "task_loss": 0.5943659543991089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5084221363067627, + "epoch": 3.16, + "learning_rate": 3.4222316145393066e-05, + "loss": 0.7514, + "step": 3733, + "task_loss": 1.2541954517364502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5626847147941589, + "epoch": 3.16, + "learning_rate": 3.4218089602704986e-05, + "loss": 0.6204, + "step": 3734, + "task_loss": 0.8054807782173157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7527306079864502, + "epoch": 3.16, + "learning_rate": 3.421386306001691e-05, + "loss": 0.7216, + "step": 3735, + "task_loss": 1.3229390382766724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36895665526390076, + "epoch": 3.16, + "learning_rate": 3.4209636517328825e-05, + "loss": 0.5362, + "step": 3736, + "task_loss": 1.2897366285324097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6206696033477783, + "epoch": 3.16, + "learning_rate": 3.4205409974640745e-05, + "loss": 0.6655, + "step": 3737, + "task_loss": 0.749636173248291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6683449149131775, + "epoch": 3.16, + "learning_rate": 3.4201183431952665e-05, + "loss": 0.5945, + "step": 3738, + "task_loss": 1.2045040130615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3047749996185303, + "epoch": 3.16, + "learning_rate": 3.419695688926458e-05, + "loss": 0.5077, + "step": 3739, + "task_loss": 0.3758259117603302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7239495515823364, + "epoch": 3.16, + "learning_rate": 3.4192730346576504e-05, + "loss": 0.572, + "step": 3740, + "task_loss": 1.604012370109558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8683885931968689, + "epoch": 3.16, + "learning_rate": 3.4188503803888424e-05, + "loss": 0.7127, + "step": 3741, + "task_loss": 0.8793639540672302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7540621757507324, + "epoch": 3.16, + "learning_rate": 3.418427726120034e-05, + "loss": 0.791, + "step": 3742, + "task_loss": 0.9764981865882874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7124232053756714, + "epoch": 3.16, + "learning_rate": 3.418005071851226e-05, + "loss": 0.6411, + "step": 3743, + "task_loss": 0.8723115921020508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6217201948165894, + "epoch": 3.16, + "learning_rate": 3.4175824175824177e-05, + "loss": 0.6833, + "step": 3744, + "task_loss": 1.2725998163223267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7923558354377747, + "epoch": 3.17, + "learning_rate": 3.4171597633136096e-05, + "loss": 0.6733, + "step": 3745, + "task_loss": 0.701529860496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4069065451622009, + "epoch": 3.17, + "learning_rate": 3.4167371090448016e-05, + "loss": 0.8203, + "step": 3746, + "task_loss": 0.5882713794708252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7379698753356934, + "epoch": 3.17, + "learning_rate": 3.4163144547759936e-05, + "loss": 0.6767, + "step": 3747, + "task_loss": 0.5868172645568848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5990483164787292, + "epoch": 3.17, + "learning_rate": 3.4158918005071856e-05, + "loss": 0.6854, + "step": 3748, + "task_loss": 0.8165649771690369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7794822454452515, + "epoch": 3.17, + "learning_rate": 3.415469146238377e-05, + "loss": 0.9131, + "step": 3749, + "task_loss": 0.5788837671279907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.587905764579773, + "epoch": 3.17, + "learning_rate": 3.415046491969569e-05, + "loss": 0.5252, + "step": 3750, + "task_loss": 0.28707078099250793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3462509512901306, + "epoch": 3.17, + "learning_rate": 3.414623837700761e-05, + "loss": 0.5549, + "step": 3751, + "task_loss": 0.26485949754714966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36292195320129395, + "epoch": 3.17, + "learning_rate": 3.414201183431953e-05, + "loss": 0.5105, + "step": 3752, + "task_loss": 0.3786086142063141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6408847570419312, + "epoch": 3.17, + "learning_rate": 3.413778529163145e-05, + "loss": 0.6511, + "step": 3753, + "task_loss": 0.6492927074432373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45942166447639465, + "epoch": 3.17, + "learning_rate": 3.413355874894337e-05, + "loss": 0.6294, + "step": 3754, + "task_loss": 1.1160677671432495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5613938570022583, + "epoch": 3.17, + "learning_rate": 3.412933220625528e-05, + "loss": 0.6821, + "step": 3755, + "task_loss": 0.9854567050933838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3202049136161804, + "epoch": 3.17, + "learning_rate": 3.41251056635672e-05, + "loss": 0.6158, + "step": 3756, + "task_loss": 0.10007906705141068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4958118796348572, + "epoch": 3.18, + "learning_rate": 3.4120879120879126e-05, + "loss": 0.6717, + "step": 3757, + "task_loss": 0.8764507174491882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.493516743183136, + "epoch": 3.18, + "learning_rate": 3.4116652578191046e-05, + "loss": 0.5277, + "step": 3758, + "task_loss": 0.9716719388961792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5764971971511841, + "epoch": 3.18, + "learning_rate": 3.411242603550296e-05, + "loss": 0.6181, + "step": 3759, + "task_loss": 1.2201251983642578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8238059282302856, + "epoch": 3.18, + "learning_rate": 3.410819949281488e-05, + "loss": 0.8158, + "step": 3760, + "task_loss": 0.4241720139980316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4805119037628174, + "epoch": 3.18, + "learning_rate": 3.41039729501268e-05, + "loss": 0.6478, + "step": 3761, + "task_loss": 0.4531811475753784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5111087560653687, + "epoch": 3.18, + "learning_rate": 3.409974640743872e-05, + "loss": 0.5142, + "step": 3762, + "task_loss": 0.688961386680603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5467681288719177, + "epoch": 3.18, + "learning_rate": 3.409551986475064e-05, + "loss": 0.6499, + "step": 3763, + "task_loss": 0.31389936804771423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6697627305984497, + "epoch": 3.18, + "learning_rate": 3.409129332206256e-05, + "loss": 0.5545, + "step": 3764, + "task_loss": 1.1988121271133423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6300317645072937, + "epoch": 3.18, + "learning_rate": 3.408706677937447e-05, + "loss": 0.7127, + "step": 3765, + "task_loss": 0.39688840508461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5210208892822266, + "epoch": 3.18, + "learning_rate": 3.408284023668639e-05, + "loss": 0.5281, + "step": 3766, + "task_loss": 1.0124322175979614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8983035683631897, + "epoch": 3.18, + "learning_rate": 3.407861369399831e-05, + "loss": 0.6345, + "step": 3767, + "task_loss": 1.0718694925308228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6712415218353271, + "epoch": 3.19, + "learning_rate": 3.407438715131023e-05, + "loss": 0.4919, + "step": 3768, + "task_loss": 1.2112324237823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9172640442848206, + "epoch": 3.19, + "learning_rate": 3.407016060862215e-05, + "loss": 0.7208, + "step": 3769, + "task_loss": 0.9262511134147644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7018153667449951, + "epoch": 3.19, + "learning_rate": 3.406593406593407e-05, + "loss": 0.792, + "step": 3770, + "task_loss": 0.3578963279724121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.744202196598053, + "epoch": 3.19, + "learning_rate": 3.406170752324598e-05, + "loss": 0.8204, + "step": 3771, + "task_loss": 1.2293626070022583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34972304105758667, + "epoch": 3.19, + "learning_rate": 3.40574809805579e-05, + "loss": 0.5734, + "step": 3772, + "task_loss": 0.7316970229148865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5744720697402954, + "epoch": 3.19, + "learning_rate": 3.405325443786982e-05, + "loss": 0.8727, + "step": 3773, + "task_loss": 1.7128212451934814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.555213212966919, + "epoch": 3.19, + "learning_rate": 3.404902789518175e-05, + "loss": 0.6076, + "step": 3774, + "task_loss": 1.0635849237442017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6892537474632263, + "epoch": 3.19, + "learning_rate": 3.404480135249366e-05, + "loss": 0.7379, + "step": 3775, + "task_loss": 0.959374725818634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40325430035591125, + "epoch": 3.19, + "learning_rate": 3.404057480980558e-05, + "loss": 0.6955, + "step": 3776, + "task_loss": 0.6035494804382324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4382266104221344, + "epoch": 3.19, + "learning_rate": 3.40363482671175e-05, + "loss": 0.5323, + "step": 3777, + "task_loss": 0.7823032736778259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6094955801963806, + "epoch": 3.19, + "learning_rate": 3.4032121724429414e-05, + "loss": 0.5933, + "step": 3778, + "task_loss": 0.29179009795188904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6489497423171997, + "epoch": 3.19, + "learning_rate": 3.402789518174134e-05, + "loss": 0.6234, + "step": 3779, + "task_loss": 0.5940364599227905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5587323307991028, + "epoch": 3.2, + "learning_rate": 3.402366863905326e-05, + "loss": 0.6553, + "step": 3780, + "task_loss": 1.3246991634368896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5941123962402344, + "epoch": 3.2, + "learning_rate": 3.401944209636517e-05, + "loss": 0.7707, + "step": 3781, + "task_loss": 0.47114717960357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2677892446517944, + "epoch": 3.2, + "learning_rate": 3.401521555367709e-05, + "loss": 0.7269, + "step": 3782, + "task_loss": 2.1561338901519775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5927201509475708, + "epoch": 3.2, + "learning_rate": 3.401098901098901e-05, + "loss": 0.6243, + "step": 3783, + "task_loss": 0.8887702822685242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5167626142501831, + "epoch": 3.2, + "learning_rate": 3.400676246830093e-05, + "loss": 0.6644, + "step": 3784, + "task_loss": 0.4265687167644501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2229374647140503, + "epoch": 3.2, + "learning_rate": 3.400253592561285e-05, + "loss": 0.6521, + "step": 3785, + "task_loss": 1.2462905645370483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6373627185821533, + "epoch": 3.2, + "learning_rate": 3.399830938292477e-05, + "loss": 0.5816, + "step": 3786, + "task_loss": 0.30056482553482056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5628971457481384, + "epoch": 3.2, + "learning_rate": 3.399408284023669e-05, + "loss": 0.4788, + "step": 3787, + "task_loss": 0.7161725759506226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2736262381076813, + "epoch": 3.2, + "learning_rate": 3.3989856297548604e-05, + "loss": 0.6858, + "step": 3788, + "task_loss": 0.6323650479316711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6507004499435425, + "epoch": 3.2, + "learning_rate": 3.3985629754860524e-05, + "loss": 0.5559, + "step": 3789, + "task_loss": 0.8334919810295105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7830895781517029, + "epoch": 3.2, + "learning_rate": 3.3981403212172444e-05, + "loss": 0.7053, + "step": 3790, + "task_loss": 0.5140246748924255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5599095821380615, + "epoch": 3.2, + "learning_rate": 3.3977176669484364e-05, + "loss": 0.5615, + "step": 3791, + "task_loss": 0.7034794092178345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5156378746032715, + "epoch": 3.21, + "learning_rate": 3.397295012679628e-05, + "loss": 0.6541, + "step": 3792, + "task_loss": 0.47448819875717163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6824741959571838, + "epoch": 3.21, + "learning_rate": 3.39687235841082e-05, + "loss": 0.7199, + "step": 3793, + "task_loss": 1.0980820655822754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5753838419914246, + "epoch": 3.21, + "learning_rate": 3.3964497041420116e-05, + "loss": 0.6556, + "step": 3794, + "task_loss": 0.34962376952171326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7017492055892944, + "epoch": 3.21, + "learning_rate": 3.3960270498732036e-05, + "loss": 0.5851, + "step": 3795, + "task_loss": 1.6766815185546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5991634130477905, + "epoch": 3.21, + "learning_rate": 3.395604395604396e-05, + "loss": 0.5041, + "step": 3796, + "task_loss": 0.90635085105896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5647245645523071, + "epoch": 3.21, + "learning_rate": 3.3951817413355875e-05, + "loss": 0.5586, + "step": 3797, + "task_loss": 0.7481592297554016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5071923732757568, + "epoch": 3.21, + "learning_rate": 3.3947590870667795e-05, + "loss": 0.6085, + "step": 3798, + "task_loss": 0.41574627161026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7173941135406494, + "epoch": 3.21, + "learning_rate": 3.3943364327979715e-05, + "loss": 0.8112, + "step": 3799, + "task_loss": 0.42481133341789246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9913312792778015, + "epoch": 3.21, + "learning_rate": 3.393913778529163e-05, + "loss": 0.7504, + "step": 3800, + "task_loss": 0.7806087136268616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6741906404495239, + "epoch": 3.21, + "learning_rate": 3.3934911242603554e-05, + "loss": 0.772, + "step": 3801, + "task_loss": 0.6874298453330994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39445626735687256, + "epoch": 3.21, + "learning_rate": 3.3930684699915474e-05, + "loss": 0.6312, + "step": 3802, + "task_loss": 0.8107923865318298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.776245653629303, + "epoch": 3.21, + "learning_rate": 3.3926458157227394e-05, + "loss": 0.6365, + "step": 3803, + "task_loss": 0.5789065957069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6302706003189087, + "epoch": 3.22, + "learning_rate": 3.392223161453931e-05, + "loss": 0.7743, + "step": 3804, + "task_loss": 1.3697251081466675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6761274337768555, + "epoch": 3.22, + "learning_rate": 3.3918005071851226e-05, + "loss": 0.8965, + "step": 3805, + "task_loss": 0.5946615934371948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9160383939743042, + "epoch": 3.22, + "learning_rate": 3.3913778529163146e-05, + "loss": 0.6498, + "step": 3806, + "task_loss": 0.6681810617446899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45323342084884644, + "epoch": 3.22, + "learning_rate": 3.3909551986475066e-05, + "loss": 0.5182, + "step": 3807, + "task_loss": 0.9610430002212524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5408672094345093, + "epoch": 3.22, + "learning_rate": 3.3905325443786986e-05, + "loss": 0.5985, + "step": 3808, + "task_loss": 0.6297504901885986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.679121732711792, + "epoch": 3.22, + "learning_rate": 3.3901098901098905e-05, + "loss": 0.7845, + "step": 3809, + "task_loss": 0.3974277973175049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5405347347259521, + "epoch": 3.22, + "learning_rate": 3.389687235841082e-05, + "loss": 0.7444, + "step": 3810, + "task_loss": 0.9425486922264099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5738269090652466, + "epoch": 3.22, + "learning_rate": 3.389264581572274e-05, + "loss": 0.6617, + "step": 3811, + "task_loss": 1.0415279865264893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3114665150642395, + "epoch": 3.22, + "learning_rate": 3.388841927303466e-05, + "loss": 0.6941, + "step": 3812, + "task_loss": 0.24100200831890106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9643857479095459, + "epoch": 3.22, + "learning_rate": 3.388419273034658e-05, + "loss": 0.7422, + "step": 3813, + "task_loss": 0.9303734302520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2126593589782715, + "epoch": 3.22, + "learning_rate": 3.38799661876585e-05, + "loss": 0.674, + "step": 3814, + "task_loss": 1.382112979888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7923911809921265, + "epoch": 3.22, + "learning_rate": 3.387573964497042e-05, + "loss": 0.7357, + "step": 3815, + "task_loss": 2.0474979877471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8287831544876099, + "epoch": 3.23, + "learning_rate": 3.387151310228234e-05, + "loss": 0.7787, + "step": 3816, + "task_loss": 0.9554091691970825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5383998155593872, + "epoch": 3.23, + "learning_rate": 3.386728655959425e-05, + "loss": 0.6112, + "step": 3817, + "task_loss": 0.900924801826477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49445056915283203, + "epoch": 3.23, + "learning_rate": 3.3863060016906176e-05, + "loss": 0.6665, + "step": 3818, + "task_loss": 0.993209719657898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.619189977645874, + "epoch": 3.23, + "learning_rate": 3.3858833474218096e-05, + "loss": 0.5328, + "step": 3819, + "task_loss": 0.649749219417572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8845866322517395, + "epoch": 3.23, + "learning_rate": 3.385460693153001e-05, + "loss": 0.7296, + "step": 3820, + "task_loss": 1.6628559827804565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.568413496017456, + "epoch": 3.23, + "learning_rate": 3.385038038884193e-05, + "loss": 0.7596, + "step": 3821, + "task_loss": 0.7251441478729248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8410848379135132, + "epoch": 3.23, + "learning_rate": 3.384615384615385e-05, + "loss": 0.6969, + "step": 3822, + "task_loss": 1.9016329050064087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6509816646575928, + "epoch": 3.23, + "learning_rate": 3.384192730346576e-05, + "loss": 0.5989, + "step": 3823, + "task_loss": 0.6418876051902771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43573498725891113, + "epoch": 3.23, + "learning_rate": 3.383770076077769e-05, + "loss": 0.6344, + "step": 3824, + "task_loss": 0.7386416792869568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5029503107070923, + "epoch": 3.23, + "learning_rate": 3.383347421808961e-05, + "loss": 0.6811, + "step": 3825, + "task_loss": 0.2797193229198456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.555972695350647, + "epoch": 3.23, + "learning_rate": 3.382924767540152e-05, + "loss": 0.5548, + "step": 3826, + "task_loss": 0.5922855138778687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9157826900482178, + "epoch": 3.23, + "learning_rate": 3.382502113271344e-05, + "loss": 0.717, + "step": 3827, + "task_loss": 0.3756215572357178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.687616229057312, + "epoch": 3.24, + "learning_rate": 3.382079459002536e-05, + "loss": 0.593, + "step": 3828, + "task_loss": 0.47909826040267944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36408451199531555, + "epoch": 3.24, + "learning_rate": 3.381656804733728e-05, + "loss": 0.5341, + "step": 3829, + "task_loss": 0.3287026286125183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44365108013153076, + "epoch": 3.24, + "learning_rate": 3.38123415046492e-05, + "loss": 0.6658, + "step": 3830, + "task_loss": 0.2332337200641632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40200281143188477, + "epoch": 3.24, + "learning_rate": 3.380811496196112e-05, + "loss": 0.5684, + "step": 3831, + "task_loss": 0.5256891846656799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4989927411079407, + "epoch": 3.24, + "learning_rate": 3.380388841927304e-05, + "loss": 0.5325, + "step": 3832, + "task_loss": 1.267116665840149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7334017157554626, + "epoch": 3.24, + "learning_rate": 3.379966187658495e-05, + "loss": 0.56, + "step": 3833, + "task_loss": 0.5534391403198242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4674106538295746, + "epoch": 3.24, + "learning_rate": 3.379543533389687e-05, + "loss": 0.6094, + "step": 3834, + "task_loss": 1.1380066871643066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5072059631347656, + "epoch": 3.24, + "learning_rate": 3.37912087912088e-05, + "loss": 0.5302, + "step": 3835, + "task_loss": 1.4863840341567993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6701310873031616, + "epoch": 3.24, + "learning_rate": 3.378698224852071e-05, + "loss": 0.645, + "step": 3836, + "task_loss": 0.9406919479370117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4624938368797302, + "epoch": 3.24, + "learning_rate": 3.378275570583263e-05, + "loss": 0.5949, + "step": 3837, + "task_loss": 0.7320272922515869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48286527395248413, + "epoch": 3.24, + "learning_rate": 3.377852916314455e-05, + "loss": 0.6212, + "step": 3838, + "task_loss": 0.4311377704143524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3031708896160126, + "epoch": 3.24, + "learning_rate": 3.3774302620456464e-05, + "loss": 0.7132, + "step": 3839, + "task_loss": 0.2939806282520294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35981088876724243, + "epoch": 3.25, + "learning_rate": 3.377007607776838e-05, + "loss": 0.4392, + "step": 3840, + "task_loss": 0.8133130669593811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8086020350456238, + "epoch": 3.25, + "learning_rate": 3.376584953508031e-05, + "loss": 0.6996, + "step": 3841, + "task_loss": 1.0303314924240112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5664933919906616, + "epoch": 3.25, + "learning_rate": 3.376162299239222e-05, + "loss": 0.5265, + "step": 3842, + "task_loss": 0.5581521987915039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.596442699432373, + "epoch": 3.25, + "learning_rate": 3.375739644970414e-05, + "loss": 0.9495, + "step": 3843, + "task_loss": 1.3157193660736084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45182183384895325, + "epoch": 3.25, + "learning_rate": 3.375316990701606e-05, + "loss": 0.6561, + "step": 3844, + "task_loss": 0.8621368408203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5898299813270569, + "epoch": 3.25, + "learning_rate": 3.374894336432798e-05, + "loss": 0.6432, + "step": 3845, + "task_loss": 0.6100964546203613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4014677405357361, + "epoch": 3.25, + "learning_rate": 3.37447168216399e-05, + "loss": 0.5069, + "step": 3846, + "task_loss": 0.11786511540412903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8498715162277222, + "epoch": 3.25, + "learning_rate": 3.374049027895182e-05, + "loss": 0.9378, + "step": 3847, + "task_loss": 0.8390326499938965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6927406787872314, + "epoch": 3.25, + "learning_rate": 3.373626373626374e-05, + "loss": 0.8032, + "step": 3848, + "task_loss": 0.8705059289932251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7547593712806702, + "epoch": 3.25, + "learning_rate": 3.3732037193575654e-05, + "loss": 0.7301, + "step": 3849, + "task_loss": 1.809188961982727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3883437514305115, + "epoch": 3.25, + "learning_rate": 3.3727810650887574e-05, + "loss": 0.488, + "step": 3850, + "task_loss": 0.07945055514574051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9190082550048828, + "epoch": 3.26, + "learning_rate": 3.3723584108199494e-05, + "loss": 0.5127, + "step": 3851, + "task_loss": 0.8374632596969604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5903159379959106, + "epoch": 3.26, + "learning_rate": 3.3719357565511413e-05, + "loss": 0.7062, + "step": 3852, + "task_loss": 0.6006965041160583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4534945785999298, + "epoch": 3.26, + "learning_rate": 3.371513102282333e-05, + "loss": 0.4788, + "step": 3853, + "task_loss": 0.24314014613628387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5452814102172852, + "epoch": 3.26, + "learning_rate": 3.371090448013525e-05, + "loss": 0.8426, + "step": 3854, + "task_loss": 0.9421284794807434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9109559059143066, + "epoch": 3.26, + "learning_rate": 3.3706677937447166e-05, + "loss": 0.698, + "step": 3855, + "task_loss": 1.0641039609909058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4101806879043579, + "epoch": 3.26, + "learning_rate": 3.3702451394759086e-05, + "loss": 0.5488, + "step": 3856, + "task_loss": 1.55475652217865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3298285007476807, + "epoch": 3.26, + "learning_rate": 3.3698224852071005e-05, + "loss": 0.6337, + "step": 3857, + "task_loss": 0.8242889046669006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7067632675170898, + "epoch": 3.26, + "learning_rate": 3.3693998309382925e-05, + "loss": 0.6156, + "step": 3858, + "task_loss": 1.3002690076828003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6895588636398315, + "epoch": 3.26, + "learning_rate": 3.3689771766694845e-05, + "loss": 0.5168, + "step": 3859, + "task_loss": 0.4281080663204193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4891388416290283, + "epoch": 3.26, + "learning_rate": 3.3685545224006765e-05, + "loss": 0.633, + "step": 3860, + "task_loss": 0.5239247679710388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0667812824249268, + "epoch": 3.26, + "learning_rate": 3.3681318681318684e-05, + "loss": 0.724, + "step": 3861, + "task_loss": 1.9683423042297363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44475460052490234, + "epoch": 3.26, + "learning_rate": 3.36770921386306e-05, + "loss": 0.6531, + "step": 3862, + "task_loss": 0.3908093571662903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4851522445678711, + "epoch": 3.27, + "learning_rate": 3.3672865595942524e-05, + "loss": 0.6904, + "step": 3863, + "task_loss": 0.46170082688331604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3814062774181366, + "epoch": 3.27, + "learning_rate": 3.3668639053254444e-05, + "loss": 0.4736, + "step": 3864, + "task_loss": 0.6820520758628845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5738285183906555, + "epoch": 3.27, + "learning_rate": 3.3664412510566357e-05, + "loss": 0.5907, + "step": 3865, + "task_loss": 0.7286311388015747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48386770486831665, + "epoch": 3.27, + "learning_rate": 3.3660185967878276e-05, + "loss": 0.5048, + "step": 3866, + "task_loss": 0.5828297138214111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.026790976524353, + "epoch": 3.27, + "learning_rate": 3.3655959425190196e-05, + "loss": 0.8365, + "step": 3867, + "task_loss": 0.7935538291931152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38497093319892883, + "epoch": 3.27, + "learning_rate": 3.3651732882502116e-05, + "loss": 0.4633, + "step": 3868, + "task_loss": 0.2598867416381836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9358500838279724, + "epoch": 3.27, + "learning_rate": 3.3647506339814035e-05, + "loss": 0.6402, + "step": 3869, + "task_loss": 1.0280195474624634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6167775988578796, + "epoch": 3.27, + "learning_rate": 3.3643279797125955e-05, + "loss": 0.7177, + "step": 3870, + "task_loss": 0.9789909720420837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3970627188682556, + "epoch": 3.27, + "learning_rate": 3.363905325443787e-05, + "loss": 0.6239, + "step": 3871, + "task_loss": 1.4325599670410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9278029203414917, + "epoch": 3.27, + "learning_rate": 3.363482671174979e-05, + "loss": 0.6757, + "step": 3872, + "task_loss": 1.0422202348709106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1656203269958496, + "epoch": 3.27, + "learning_rate": 3.363060016906171e-05, + "loss": 0.8446, + "step": 3873, + "task_loss": 1.0028610229492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.553748607635498, + "epoch": 3.27, + "learning_rate": 3.362637362637363e-05, + "loss": 0.4543, + "step": 3874, + "task_loss": 0.4587751030921936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45819446444511414, + "epoch": 3.28, + "learning_rate": 3.362214708368555e-05, + "loss": 0.7845, + "step": 3875, + "task_loss": 0.7192927598953247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7986859679222107, + "epoch": 3.28, + "learning_rate": 3.361792054099747e-05, + "loss": 0.6381, + "step": 3876, + "task_loss": 0.9620726704597473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6307942867279053, + "epoch": 3.28, + "learning_rate": 3.3613693998309387e-05, + "loss": 0.8671, + "step": 3877, + "task_loss": 0.48219799995422363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6521656513214111, + "epoch": 3.28, + "learning_rate": 3.36094674556213e-05, + "loss": 0.5479, + "step": 3878, + "task_loss": 1.2433865070343018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0597290992736816, + "epoch": 3.28, + "learning_rate": 3.360524091293322e-05, + "loss": 0.7366, + "step": 3879, + "task_loss": 0.36986494064331055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6747319102287292, + "epoch": 3.28, + "learning_rate": 3.3601014370245146e-05, + "loss": 0.6707, + "step": 3880, + "task_loss": 2.446615219116211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.581082820892334, + "epoch": 3.28, + "learning_rate": 3.359678782755706e-05, + "loss": 0.6838, + "step": 3881, + "task_loss": 0.8097153306007385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49912330508232117, + "epoch": 3.28, + "learning_rate": 3.359256128486898e-05, + "loss": 0.595, + "step": 3882, + "task_loss": 0.3019488751888275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4520947337150574, + "epoch": 3.28, + "learning_rate": 3.35883347421809e-05, + "loss": 0.4635, + "step": 3883, + "task_loss": 0.552654504776001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5838029384613037, + "epoch": 3.28, + "learning_rate": 3.358410819949281e-05, + "loss": 0.6597, + "step": 3884, + "task_loss": 1.3630002737045288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31268203258514404, + "epoch": 3.28, + "learning_rate": 3.357988165680474e-05, + "loss": 0.5935, + "step": 3885, + "task_loss": 0.06699737161397934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7134374380111694, + "epoch": 3.28, + "learning_rate": 3.357565511411666e-05, + "loss": 0.719, + "step": 3886, + "task_loss": 0.5727769136428833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9589802622795105, + "epoch": 3.29, + "learning_rate": 3.357142857142857e-05, + "loss": 0.7402, + "step": 3887, + "task_loss": 0.7933779954910278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9160937070846558, + "epoch": 3.29, + "learning_rate": 3.356720202874049e-05, + "loss": 0.6912, + "step": 3888, + "task_loss": 1.1651965379714966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5411211252212524, + "epoch": 3.29, + "learning_rate": 3.356297548605241e-05, + "loss": 0.7335, + "step": 3889, + "task_loss": 1.470801591873169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3677292764186859, + "epoch": 3.29, + "learning_rate": 3.355874894336433e-05, + "loss": 0.6632, + "step": 3890, + "task_loss": 0.8832083344459534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41046008467674255, + "epoch": 3.29, + "learning_rate": 3.355452240067625e-05, + "loss": 0.6883, + "step": 3891, + "task_loss": 0.33027833700180054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49018967151641846, + "epoch": 3.29, + "learning_rate": 3.355029585798817e-05, + "loss": 0.554, + "step": 3892, + "task_loss": 0.7048411965370178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7225145101547241, + "epoch": 3.29, + "learning_rate": 3.354606931530009e-05, + "loss": 0.548, + "step": 3893, + "task_loss": 1.3113082647323608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42464256286621094, + "epoch": 3.29, + "learning_rate": 3.3541842772612e-05, + "loss": 0.6451, + "step": 3894, + "task_loss": 0.5623482465744019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5087553262710571, + "epoch": 3.29, + "learning_rate": 3.353761622992392e-05, + "loss": 0.5388, + "step": 3895, + "task_loss": 0.5432758331298828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4386586546897888, + "epoch": 3.29, + "learning_rate": 3.353338968723584e-05, + "loss": 0.6313, + "step": 3896, + "task_loss": 0.6271265745162964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9487965106964111, + "epoch": 3.29, + "learning_rate": 3.352916314454776e-05, + "loss": 0.8178, + "step": 3897, + "task_loss": 1.5867424011230469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34835946559906006, + "epoch": 3.29, + "learning_rate": 3.352493660185968e-05, + "loss": 0.637, + "step": 3898, + "task_loss": 1.0247727632522583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4320606291294098, + "epoch": 3.3, + "learning_rate": 3.35207100591716e-05, + "loss": 0.6995, + "step": 3899, + "task_loss": 1.3939924240112305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49219024181365967, + "epoch": 3.3, + "learning_rate": 3.3516483516483513e-05, + "loss": 0.6035, + "step": 3900, + "task_loss": 1.4756388664245605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3515661060810089, + "epoch": 3.3, + "learning_rate": 3.351225697379543e-05, + "loss": 0.4611, + "step": 3901, + "task_loss": 0.5603144764900208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0203819274902344, + "epoch": 3.3, + "learning_rate": 3.350803043110736e-05, + "loss": 0.7224, + "step": 3902, + "task_loss": 1.6708731651306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4209211766719818, + "epoch": 3.3, + "learning_rate": 3.350380388841928e-05, + "loss": 0.631, + "step": 3903, + "task_loss": 1.2993181943893433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7542859315872192, + "epoch": 3.3, + "learning_rate": 3.349957734573119e-05, + "loss": 0.6682, + "step": 3904, + "task_loss": 1.2208454608917236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5419905185699463, + "epoch": 3.3, + "learning_rate": 3.349535080304311e-05, + "loss": 0.5774, + "step": 3905, + "task_loss": 0.4915432035923004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7790452241897583, + "epoch": 3.3, + "learning_rate": 3.349112426035503e-05, + "loss": 0.6745, + "step": 3906, + "task_loss": 0.6758826375007629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7102470993995667, + "epoch": 3.3, + "learning_rate": 3.348689771766695e-05, + "loss": 0.723, + "step": 3907, + "task_loss": 1.0424365997314453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4168453514575958, + "epoch": 3.3, + "learning_rate": 3.348267117497887e-05, + "loss": 0.7707, + "step": 3908, + "task_loss": 0.6616557240486145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5598769783973694, + "epoch": 3.3, + "learning_rate": 3.347844463229079e-05, + "loss": 0.5769, + "step": 3909, + "task_loss": 0.7414443492889404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8472377061843872, + "epoch": 3.3, + "learning_rate": 3.3474218089602704e-05, + "loss": 0.8324, + "step": 3910, + "task_loss": 0.6906710863113403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22270134091377258, + "epoch": 3.31, + "learning_rate": 3.3469991546914624e-05, + "loss": 0.6163, + "step": 3911, + "task_loss": 0.08123055845499039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7566246390342712, + "epoch": 3.31, + "learning_rate": 3.3465765004226544e-05, + "loss": 0.8558, + "step": 3912, + "task_loss": 0.8362683057785034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43627476692199707, + "epoch": 3.31, + "learning_rate": 3.346153846153846e-05, + "loss": 0.5194, + "step": 3913, + "task_loss": 0.7641618847846985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35899990797042847, + "epoch": 3.31, + "learning_rate": 3.345731191885038e-05, + "loss": 0.5453, + "step": 3914, + "task_loss": 0.14782261848449707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5322375297546387, + "epoch": 3.31, + "learning_rate": 3.34530853761623e-05, + "loss": 0.4527, + "step": 3915, + "task_loss": 0.708094596862793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7115599513053894, + "epoch": 3.31, + "learning_rate": 3.3448858833474216e-05, + "loss": 0.4722, + "step": 3916, + "task_loss": 1.097058892250061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4993571937084198, + "epoch": 3.31, + "learning_rate": 3.3444632290786135e-05, + "loss": 0.7615, + "step": 3917, + "task_loss": 0.5412318706512451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6557318568229675, + "epoch": 3.31, + "learning_rate": 3.3440405748098055e-05, + "loss": 0.703, + "step": 3918, + "task_loss": 0.5193818807601929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5269460082054138, + "epoch": 3.31, + "learning_rate": 3.343617920540998e-05, + "loss": 0.5943, + "step": 3919, + "task_loss": 0.9898266792297363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3797948956489563, + "epoch": 3.31, + "learning_rate": 3.3431952662721895e-05, + "loss": 0.6485, + "step": 3920, + "task_loss": 0.37862321734428406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0981862545013428, + "epoch": 3.31, + "learning_rate": 3.3427726120033814e-05, + "loss": 1.0162, + "step": 3921, + "task_loss": 0.2785789966583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6004927158355713, + "epoch": 3.32, + "learning_rate": 3.3423499577345734e-05, + "loss": 0.6426, + "step": 3922, + "task_loss": 1.1608461141586304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7121838331222534, + "epoch": 3.32, + "learning_rate": 3.341927303465765e-05, + "loss": 0.7206, + "step": 3923, + "task_loss": 1.717772364616394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8762622475624084, + "epoch": 3.32, + "learning_rate": 3.3415046491969574e-05, + "loss": 0.6832, + "step": 3924, + "task_loss": 2.074665069580078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30707815289497375, + "epoch": 3.32, + "learning_rate": 3.341081994928149e-05, + "loss": 0.7001, + "step": 3925, + "task_loss": 0.7765181064605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6167467832565308, + "epoch": 3.32, + "learning_rate": 3.3406593406593406e-05, + "loss": 0.6838, + "step": 3926, + "task_loss": 0.53131103515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44397321343421936, + "epoch": 3.32, + "learning_rate": 3.3402366863905326e-05, + "loss": 0.5643, + "step": 3927, + "task_loss": 0.3047277331352234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3708931803703308, + "epoch": 3.32, + "learning_rate": 3.3398140321217246e-05, + "loss": 0.4935, + "step": 3928, + "task_loss": 0.38013702630996704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8187675476074219, + "epoch": 3.32, + "learning_rate": 3.3393913778529166e-05, + "loss": 0.6628, + "step": 3929, + "task_loss": 0.5122108459472656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.649563193321228, + "epoch": 3.32, + "learning_rate": 3.3389687235841085e-05, + "loss": 0.7425, + "step": 3930, + "task_loss": 0.8000655770301819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26284122467041016, + "epoch": 3.32, + "learning_rate": 3.3385460693153005e-05, + "loss": 0.5002, + "step": 3931, + "task_loss": 0.17851315438747406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6204005479812622, + "epoch": 3.32, + "learning_rate": 3.3381234150464925e-05, + "loss": 0.641, + "step": 3932, + "task_loss": 0.3291212022304535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.682673454284668, + "epoch": 3.32, + "learning_rate": 3.337700760777684e-05, + "loss": 0.6991, + "step": 3933, + "task_loss": 1.032178521156311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6322749853134155, + "epoch": 3.33, + "learning_rate": 3.337278106508876e-05, + "loss": 0.5521, + "step": 3934, + "task_loss": 0.7650724649429321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6840622425079346, + "epoch": 3.33, + "learning_rate": 3.336855452240068e-05, + "loss": 0.7621, + "step": 3935, + "task_loss": 0.6678985953330994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5300444960594177, + "epoch": 3.33, + "learning_rate": 3.33643279797126e-05, + "loss": 0.7967, + "step": 3936, + "task_loss": 1.1937285661697388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41465961933135986, + "epoch": 3.33, + "learning_rate": 3.336010143702452e-05, + "loss": 0.5593, + "step": 3937, + "task_loss": 0.2977862060070038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5342230796813965, + "epoch": 3.33, + "learning_rate": 3.3355874894336436e-05, + "loss": 0.6051, + "step": 3938, + "task_loss": 0.7423573732376099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4787595272064209, + "epoch": 3.33, + "learning_rate": 3.335164835164835e-05, + "loss": 0.4094, + "step": 3939, + "task_loss": 0.6922428607940674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6558927893638611, + "epoch": 3.33, + "learning_rate": 3.334742180896027e-05, + "loss": 0.71, + "step": 3940, + "task_loss": 0.267610102891922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42038998007774353, + "epoch": 3.33, + "learning_rate": 3.3343195266272196e-05, + "loss": 0.7359, + "step": 3941, + "task_loss": 0.824495792388916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7075364589691162, + "epoch": 3.33, + "learning_rate": 3.333896872358411e-05, + "loss": 0.6182, + "step": 3942, + "task_loss": 1.3665658235549927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49442434310913086, + "epoch": 3.33, + "learning_rate": 3.333474218089603e-05, + "loss": 0.4703, + "step": 3943, + "task_loss": 0.14843980967998505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37423110008239746, + "epoch": 3.33, + "learning_rate": 3.333051563820795e-05, + "loss": 0.491, + "step": 3944, + "task_loss": 1.5335365533828735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34091874957084656, + "epoch": 3.33, + "learning_rate": 3.332628909551986e-05, + "loss": 0.5345, + "step": 3945, + "task_loss": 1.0860202312469482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6558480858802795, + "epoch": 3.34, + "learning_rate": 3.332206255283179e-05, + "loss": 0.6812, + "step": 3946, + "task_loss": 0.796873927116394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6395263075828552, + "epoch": 3.34, + "learning_rate": 3.331783601014371e-05, + "loss": 0.5819, + "step": 3947, + "task_loss": 0.4183849096298218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49171847105026245, + "epoch": 3.34, + "learning_rate": 3.331360946745563e-05, + "loss": 0.575, + "step": 3948, + "task_loss": 0.684720516204834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.578411340713501, + "epoch": 3.34, + "learning_rate": 3.330938292476754e-05, + "loss": 0.6977, + "step": 3949, + "task_loss": 0.5862065553665161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4539857506752014, + "epoch": 3.34, + "learning_rate": 3.330515638207946e-05, + "loss": 0.778, + "step": 3950, + "task_loss": 0.4472378194332123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3729032278060913, + "epoch": 3.34, + "learning_rate": 3.330092983939138e-05, + "loss": 0.2935, + "step": 3951, + "task_loss": 0.6985999941825867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0239622592926025, + "epoch": 3.34, + "learning_rate": 3.32967032967033e-05, + "loss": 0.6693, + "step": 3952, + "task_loss": 1.0546976327896118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7052303552627563, + "epoch": 3.34, + "learning_rate": 3.329247675401522e-05, + "loss": 0.7982, + "step": 3953, + "task_loss": 0.3319380283355713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8787857294082642, + "epoch": 3.34, + "learning_rate": 3.328825021132714e-05, + "loss": 0.6226, + "step": 3954, + "task_loss": 0.36139535903930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6386686563491821, + "epoch": 3.34, + "learning_rate": 3.328402366863905e-05, + "loss": 0.6821, + "step": 3955, + "task_loss": 1.0463898181915283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5190571546554565, + "epoch": 3.34, + "learning_rate": 3.327979712595097e-05, + "loss": 0.7957, + "step": 3956, + "task_loss": 0.6291264295578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5444065928459167, + "epoch": 3.34, + "learning_rate": 3.327557058326289e-05, + "loss": 0.6873, + "step": 3957, + "task_loss": 0.7291675806045532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5625632405281067, + "epoch": 3.35, + "learning_rate": 3.327134404057481e-05, + "loss": 0.6632, + "step": 3958, + "task_loss": 0.9873447418212891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5395931005477905, + "epoch": 3.35, + "learning_rate": 3.326711749788673e-05, + "loss": 0.7347, + "step": 3959, + "task_loss": 0.2736806273460388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4505186975002289, + "epoch": 3.35, + "learning_rate": 3.326289095519865e-05, + "loss": 0.5042, + "step": 3960, + "task_loss": 0.051546353846788406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6125847101211548, + "epoch": 3.35, + "learning_rate": 3.325866441251057e-05, + "loss": 0.7314, + "step": 3961, + "task_loss": 0.5110401511192322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6178805232048035, + "epoch": 3.35, + "learning_rate": 3.325443786982248e-05, + "loss": 0.5301, + "step": 3962, + "task_loss": 0.6961491703987122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3957834839820862, + "epoch": 3.35, + "learning_rate": 3.325021132713441e-05, + "loss": 0.5786, + "step": 3963, + "task_loss": 1.042910099029541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5367882251739502, + "epoch": 3.35, + "learning_rate": 3.324598478444633e-05, + "loss": 0.6244, + "step": 3964, + "task_loss": 0.8038404583930969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9926853775978088, + "epoch": 3.35, + "learning_rate": 3.324175824175824e-05, + "loss": 0.6856, + "step": 3965, + "task_loss": 0.7768417000770569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9465579986572266, + "epoch": 3.35, + "learning_rate": 3.323753169907016e-05, + "loss": 0.5704, + "step": 3966, + "task_loss": 0.8470343351364136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9539152383804321, + "epoch": 3.35, + "learning_rate": 3.323330515638208e-05, + "loss": 0.611, + "step": 3967, + "task_loss": 1.830016851425171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4759204089641571, + "epoch": 3.35, + "learning_rate": 3.3229078613693995e-05, + "loss": 0.6842, + "step": 3968, + "task_loss": 1.0724446773529053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7830750942230225, + "epoch": 3.35, + "learning_rate": 3.322485207100592e-05, + "loss": 0.724, + "step": 3969, + "task_loss": 1.2209429740905762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5183069109916687, + "epoch": 3.36, + "learning_rate": 3.322062552831784e-05, + "loss": 0.5557, + "step": 3970, + "task_loss": 0.08554243296384811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7506176233291626, + "epoch": 3.36, + "learning_rate": 3.3216398985629754e-05, + "loss": 0.7632, + "step": 3971, + "task_loss": 1.3869366645812988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7739876508712769, + "epoch": 3.36, + "learning_rate": 3.3212172442941674e-05, + "loss": 0.652, + "step": 3972, + "task_loss": 1.096614956855774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46949848532676697, + "epoch": 3.36, + "learning_rate": 3.3207945900253593e-05, + "loss": 0.6517, + "step": 3973, + "task_loss": 0.4659460186958313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7188059091567993, + "epoch": 3.36, + "learning_rate": 3.320371935756551e-05, + "loss": 0.5684, + "step": 3974, + "task_loss": 1.0373523235321045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.312154620885849, + "epoch": 3.36, + "learning_rate": 3.319949281487743e-05, + "loss": 0.5088, + "step": 3975, + "task_loss": 0.7555842399597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8637068271636963, + "epoch": 3.36, + "learning_rate": 3.319526627218935e-05, + "loss": 0.8029, + "step": 3976, + "task_loss": 0.5204150676727295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7043997049331665, + "epoch": 3.36, + "learning_rate": 3.319103972950127e-05, + "loss": 0.6963, + "step": 3977, + "task_loss": 1.1344348192214966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5696607232093811, + "epoch": 3.36, + "learning_rate": 3.3186813186813185e-05, + "loss": 0.7128, + "step": 3978, + "task_loss": 0.496666818857193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5712813138961792, + "epoch": 3.36, + "learning_rate": 3.3182586644125105e-05, + "loss": 0.6678, + "step": 3979, + "task_loss": 0.6548811197280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4539666473865509, + "epoch": 3.36, + "learning_rate": 3.317836010143703e-05, + "loss": 0.6589, + "step": 3980, + "task_loss": 0.21650294959545135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44822144508361816, + "epoch": 3.36, + "learning_rate": 3.3174133558748945e-05, + "loss": 0.6086, + "step": 3981, + "task_loss": 0.14732351899147034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4881581664085388, + "epoch": 3.37, + "learning_rate": 3.3169907016060864e-05, + "loss": 0.4735, + "step": 3982, + "task_loss": 0.2254583090543747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5408546924591064, + "epoch": 3.37, + "learning_rate": 3.3165680473372784e-05, + "loss": 0.658, + "step": 3983, + "task_loss": 0.8050487637519836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.81545090675354, + "epoch": 3.37, + "learning_rate": 3.31614539306847e-05, + "loss": 0.72, + "step": 3984, + "task_loss": 2.734854221343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8490267992019653, + "epoch": 3.37, + "learning_rate": 3.315722738799662e-05, + "loss": 0.7237, + "step": 3985, + "task_loss": 0.630405843257904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5908295512199402, + "epoch": 3.37, + "learning_rate": 3.315300084530854e-05, + "loss": 0.5789, + "step": 3986, + "task_loss": 1.5488214492797852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5881787538528442, + "epoch": 3.37, + "learning_rate": 3.3148774302620456e-05, + "loss": 0.5039, + "step": 3987, + "task_loss": 0.4973008930683136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4499143660068512, + "epoch": 3.37, + "learning_rate": 3.3144547759932376e-05, + "loss": 0.6232, + "step": 3988, + "task_loss": 0.30243533849716187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5703656673431396, + "epoch": 3.37, + "learning_rate": 3.3140321217244296e-05, + "loss": 0.5047, + "step": 3989, + "task_loss": 0.8970152735710144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8677539825439453, + "epoch": 3.37, + "learning_rate": 3.3136094674556215e-05, + "loss": 0.8253, + "step": 3990, + "task_loss": 0.9921838641166687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0659618377685547, + "epoch": 3.37, + "learning_rate": 3.3131868131868135e-05, + "loss": 0.7674, + "step": 3991, + "task_loss": 0.7682436108589172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7066988945007324, + "epoch": 3.37, + "learning_rate": 3.3127641589180055e-05, + "loss": 0.6248, + "step": 3992, + "task_loss": 0.6049737930297852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7294638752937317, + "epoch": 3.38, + "learning_rate": 3.3123415046491975e-05, + "loss": 0.6554, + "step": 3993, + "task_loss": 0.4263124465942383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38612285256385803, + "epoch": 3.38, + "learning_rate": 3.311918850380389e-05, + "loss": 0.6898, + "step": 3994, + "task_loss": 0.1594415307044983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6931972503662109, + "epoch": 3.38, + "learning_rate": 3.311496196111581e-05, + "loss": 0.5274, + "step": 3995, + "task_loss": 1.0477615594863892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7712715864181519, + "epoch": 3.38, + "learning_rate": 3.311073541842773e-05, + "loss": 0.7052, + "step": 3996, + "task_loss": 0.6277380585670471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8982274532318115, + "epoch": 3.38, + "learning_rate": 3.310650887573965e-05, + "loss": 0.6824, + "step": 3997, + "task_loss": 1.4021918773651123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5366221070289612, + "epoch": 3.38, + "learning_rate": 3.3102282333051567e-05, + "loss": 0.7017, + "step": 3998, + "task_loss": 0.7381435632705688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9963749647140503, + "epoch": 3.38, + "learning_rate": 3.3098055790363486e-05, + "loss": 0.7746, + "step": 3999, + "task_loss": 0.5824484825134277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3503539264202118, + "epoch": 3.38, + "learning_rate": 3.30938292476754e-05, + "loss": 0.4668, + "step": 4000, + "task_loss": 0.7994530200958252 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.8938217821782178, + "eval_loss": 0.40639403462409973, + "eval_runtime": 229.9162, + "eval_samples_per_second": 109.823, + "eval_steps_per_second": 0.861, + "step": 4000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6074734330177307, + "epoch": 3.38, + "learning_rate": 3.308960270498732e-05, + "loss": 0.628, + "step": 4001, + "task_loss": 0.7503520846366882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.092028021812439, + "epoch": 3.38, + "learning_rate": 3.308537616229924e-05, + "loss": 0.6776, + "step": 4002, + "task_loss": 1.0173532962799072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.865687370300293, + "epoch": 3.38, + "learning_rate": 3.308114961961116e-05, + "loss": 0.5595, + "step": 4003, + "task_loss": 1.0401179790496826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5781496167182922, + "epoch": 3.38, + "learning_rate": 3.307692307692308e-05, + "loss": 0.4993, + "step": 4004, + "task_loss": 0.6292144060134888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.077748417854309, + "epoch": 3.39, + "learning_rate": 3.3072696534235e-05, + "loss": 0.6869, + "step": 4005, + "task_loss": 0.8932000994682312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6122187972068787, + "epoch": 3.39, + "learning_rate": 3.306846999154692e-05, + "loss": 0.5102, + "step": 4006, + "task_loss": 0.7503175735473633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5466530323028564, + "epoch": 3.39, + "learning_rate": 3.306424344885883e-05, + "loss": 0.8431, + "step": 4007, + "task_loss": 0.5734150409698486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7588575482368469, + "epoch": 3.39, + "learning_rate": 3.306001690617076e-05, + "loss": 0.6754, + "step": 4008, + "task_loss": 0.8582702279090881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.607350766658783, + "epoch": 3.39, + "learning_rate": 3.305579036348268e-05, + "loss": 0.6199, + "step": 4009, + "task_loss": 0.5086784958839417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4584023356437683, + "epoch": 3.39, + "learning_rate": 3.305156382079459e-05, + "loss": 0.5779, + "step": 4010, + "task_loss": 0.5685446262359619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5342463850975037, + "epoch": 3.39, + "learning_rate": 3.304733727810651e-05, + "loss": 0.534, + "step": 4011, + "task_loss": 0.6923866868019104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3588854968547821, + "epoch": 3.39, + "learning_rate": 3.304311073541843e-05, + "loss": 0.4724, + "step": 4012, + "task_loss": 1.0098603963851929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6950870752334595, + "epoch": 3.39, + "learning_rate": 3.303888419273035e-05, + "loss": 0.7341, + "step": 4013, + "task_loss": 2.0473179817199707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5922557711601257, + "epoch": 3.39, + "learning_rate": 3.303465765004227e-05, + "loss": 0.6832, + "step": 4014, + "task_loss": 0.5787793397903442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6741911172866821, + "epoch": 3.39, + "learning_rate": 3.303043110735419e-05, + "loss": 0.6301, + "step": 4015, + "task_loss": 0.7569441199302673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6243327856063843, + "epoch": 3.39, + "learning_rate": 3.30262045646661e-05, + "loss": 0.5631, + "step": 4016, + "task_loss": 0.8289541602134705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5141885280609131, + "epoch": 3.4, + "learning_rate": 3.302197802197802e-05, + "loss": 0.6306, + "step": 4017, + "task_loss": 0.2160712033510208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7404823303222656, + "epoch": 3.4, + "learning_rate": 3.301775147928994e-05, + "loss": 0.7163, + "step": 4018, + "task_loss": 0.659570574760437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49293071031570435, + "epoch": 3.4, + "learning_rate": 3.301352493660186e-05, + "loss": 0.7344, + "step": 4019, + "task_loss": 0.9586378335952759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8711064457893372, + "epoch": 3.4, + "learning_rate": 3.300929839391378e-05, + "loss": 0.6297, + "step": 4020, + "task_loss": 0.40594664216041565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6690309047698975, + "epoch": 3.4, + "learning_rate": 3.30050718512257e-05, + "loss": 0.6973, + "step": 4021, + "task_loss": 0.5502723455429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5152108669281006, + "epoch": 3.4, + "learning_rate": 3.300084530853762e-05, + "loss": 0.8188, + "step": 4022, + "task_loss": 0.34748587012290955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5054956674575806, + "epoch": 3.4, + "learning_rate": 3.299661876584953e-05, + "loss": 0.6089, + "step": 4023, + "task_loss": 0.2677193880081177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.916710615158081, + "epoch": 3.4, + "learning_rate": 3.299239222316145e-05, + "loss": 0.7993, + "step": 4024, + "task_loss": 0.7530816197395325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.424642950296402, + "epoch": 3.4, + "learning_rate": 3.298816568047338e-05, + "loss": 0.6699, + "step": 4025, + "task_loss": 0.5044595003128052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5814586877822876, + "epoch": 3.4, + "learning_rate": 3.298393913778529e-05, + "loss": 0.6204, + "step": 4026, + "task_loss": 0.7369222640991211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5689500570297241, + "epoch": 3.4, + "learning_rate": 3.297971259509721e-05, + "loss": 0.8235, + "step": 4027, + "task_loss": 1.223319172859192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8197453022003174, + "epoch": 3.4, + "learning_rate": 3.297548605240913e-05, + "loss": 0.7091, + "step": 4028, + "task_loss": 1.4007569551467896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6885471343994141, + "epoch": 3.41, + "learning_rate": 3.2971259509721045e-05, + "loss": 0.5376, + "step": 4029, + "task_loss": 0.9437877535820007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49165111780166626, + "epoch": 3.41, + "learning_rate": 3.296703296703297e-05, + "loss": 0.6809, + "step": 4030, + "task_loss": 0.46014800667762756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6432880163192749, + "epoch": 3.41, + "learning_rate": 3.296280642434489e-05, + "loss": 0.7491, + "step": 4031, + "task_loss": 0.4910712242126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1444894075393677, + "epoch": 3.41, + "learning_rate": 3.2958579881656804e-05, + "loss": 0.7569, + "step": 4032, + "task_loss": 0.9607234001159668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48879674077033997, + "epoch": 3.41, + "learning_rate": 3.2954353338968724e-05, + "loss": 0.5373, + "step": 4033, + "task_loss": 0.32777366042137146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.555480420589447, + "epoch": 3.41, + "learning_rate": 3.295012679628064e-05, + "loss": 0.5601, + "step": 4034, + "task_loss": 1.6399238109588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7140830159187317, + "epoch": 3.41, + "learning_rate": 3.294590025359256e-05, + "loss": 0.7321, + "step": 4035, + "task_loss": 0.9857349395751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8306131362915039, + "epoch": 3.41, + "learning_rate": 3.294167371090448e-05, + "loss": 0.5821, + "step": 4036, + "task_loss": 1.1293448209762573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6218990087509155, + "epoch": 3.41, + "learning_rate": 3.29374471682164e-05, + "loss": 0.6024, + "step": 4037, + "task_loss": 0.30195924639701843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9194777011871338, + "epoch": 3.41, + "learning_rate": 3.293322062552832e-05, + "loss": 0.8282, + "step": 4038, + "task_loss": 1.2538440227508545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36070582270622253, + "epoch": 3.41, + "learning_rate": 3.2928994082840235e-05, + "loss": 0.6345, + "step": 4039, + "task_loss": 0.9978319406509399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2422235906124115, + "epoch": 3.41, + "learning_rate": 3.2924767540152155e-05, + "loss": 0.4458, + "step": 4040, + "task_loss": 0.45990389585494995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.826747477054596, + "epoch": 3.42, + "learning_rate": 3.2920540997464075e-05, + "loss": 0.6636, + "step": 4041, + "task_loss": 1.0456844568252563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39871031045913696, + "epoch": 3.42, + "learning_rate": 3.2916314454775994e-05, + "loss": 0.642, + "step": 4042, + "task_loss": 0.4895150065422058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33663707971572876, + "epoch": 3.42, + "learning_rate": 3.2912087912087914e-05, + "loss": 0.7944, + "step": 4043, + "task_loss": 0.8684514760971069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.614155650138855, + "epoch": 3.42, + "learning_rate": 3.2907861369399834e-05, + "loss": 0.539, + "step": 4044, + "task_loss": 0.10875343531370163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6658488512039185, + "epoch": 3.42, + "learning_rate": 3.290363482671175e-05, + "loss": 0.6066, + "step": 4045, + "task_loss": 0.1566886603832245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6465568542480469, + "epoch": 3.42, + "learning_rate": 3.2899408284023667e-05, + "loss": 0.5563, + "step": 4046, + "task_loss": 1.0074589252471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5670353770256042, + "epoch": 3.42, + "learning_rate": 3.289518174133559e-05, + "loss": 0.6195, + "step": 4047, + "task_loss": 0.7071921825408936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4479367733001709, + "epoch": 3.42, + "learning_rate": 3.289095519864751e-05, + "loss": 0.5335, + "step": 4048, + "task_loss": 0.3439406752586365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6772239208221436, + "epoch": 3.42, + "learning_rate": 3.2886728655959426e-05, + "loss": 0.6037, + "step": 4049, + "task_loss": 0.45465779304504395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6022642254829407, + "epoch": 3.42, + "learning_rate": 3.2882502113271346e-05, + "loss": 0.6563, + "step": 4050, + "task_loss": 1.0175983905792236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7284107208251953, + "epoch": 3.42, + "learning_rate": 3.2878275570583265e-05, + "loss": 0.728, + "step": 4051, + "task_loss": 1.5928486585617065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5204176902770996, + "epoch": 3.42, + "learning_rate": 3.2874049027895185e-05, + "loss": 0.6054, + "step": 4052, + "task_loss": 0.53810054063797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.628588855266571, + "epoch": 3.43, + "learning_rate": 3.2869822485207105e-05, + "loss": 0.6286, + "step": 4053, + "task_loss": 0.6430641412734985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2044486999511719, + "epoch": 3.43, + "learning_rate": 3.2865595942519024e-05, + "loss": 0.6788, + "step": 4054, + "task_loss": 0.9367953538894653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4022917151451111, + "epoch": 3.43, + "learning_rate": 3.286136939983094e-05, + "loss": 0.6369, + "step": 4055, + "task_loss": 0.5606754422187805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5345295667648315, + "epoch": 3.43, + "learning_rate": 3.285714285714286e-05, + "loss": 0.7832, + "step": 4056, + "task_loss": 0.8172202706336975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4476451873779297, + "epoch": 3.43, + "learning_rate": 3.285291631445478e-05, + "loss": 0.4829, + "step": 4057, + "task_loss": 0.5786000490188599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.250101089477539, + "epoch": 3.43, + "learning_rate": 3.28486897717667e-05, + "loss": 0.6598, + "step": 4058, + "task_loss": 0.8442792296409607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3972112238407135, + "epoch": 3.43, + "learning_rate": 3.2844463229078616e-05, + "loss": 0.5063, + "step": 4059, + "task_loss": 0.8798096179962158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41983237862586975, + "epoch": 3.43, + "learning_rate": 3.2840236686390536e-05, + "loss": 0.5357, + "step": 4060, + "task_loss": 0.4424249231815338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4093361794948578, + "epoch": 3.43, + "learning_rate": 3.283601014370245e-05, + "loss": 0.5427, + "step": 4061, + "task_loss": 0.25447678565979004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3815116286277771, + "epoch": 3.43, + "learning_rate": 3.283178360101437e-05, + "loss": 0.7215, + "step": 4062, + "task_loss": 0.7735743522644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0876439809799194, + "epoch": 3.43, + "learning_rate": 3.282755705832629e-05, + "loss": 0.8521, + "step": 4063, + "task_loss": 0.8553889989852905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47514504194259644, + "epoch": 3.44, + "learning_rate": 3.2823330515638215e-05, + "loss": 0.748, + "step": 4064, + "task_loss": 0.2769138514995575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7730213403701782, + "epoch": 3.44, + "learning_rate": 3.281910397295013e-05, + "loss": 0.5825, + "step": 4065, + "task_loss": 0.704494833946228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6209992170333862, + "epoch": 3.44, + "learning_rate": 3.281487743026205e-05, + "loss": 0.5109, + "step": 4066, + "task_loss": 0.7182151675224304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6192642450332642, + "epoch": 3.44, + "learning_rate": 3.281065088757397e-05, + "loss": 0.5397, + "step": 4067, + "task_loss": 1.446255087852478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2995608150959015, + "epoch": 3.44, + "learning_rate": 3.280642434488588e-05, + "loss": 0.6474, + "step": 4068, + "task_loss": 0.5294321179389954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6721882820129395, + "epoch": 3.44, + "learning_rate": 3.280219780219781e-05, + "loss": 0.5631, + "step": 4069, + "task_loss": 0.2013845592737198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6690796613693237, + "epoch": 3.44, + "learning_rate": 3.279797125950973e-05, + "loss": 0.6919, + "step": 4070, + "task_loss": 0.8039246201515198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7115398049354553, + "epoch": 3.44, + "learning_rate": 3.279374471682164e-05, + "loss": 0.6749, + "step": 4071, + "task_loss": 1.9238874912261963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41662031412124634, + "epoch": 3.44, + "learning_rate": 3.278951817413356e-05, + "loss": 0.5418, + "step": 4072, + "task_loss": 1.279392123222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6019200682640076, + "epoch": 3.44, + "learning_rate": 3.278529163144548e-05, + "loss": 0.531, + "step": 4073, + "task_loss": 0.6550692915916443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7717655897140503, + "epoch": 3.44, + "learning_rate": 3.27810650887574e-05, + "loss": 0.5532, + "step": 4074, + "task_loss": 0.8793482184410095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5803868174552917, + "epoch": 3.44, + "learning_rate": 3.277683854606932e-05, + "loss": 0.6325, + "step": 4075, + "task_loss": 1.3391324281692505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29155969619750977, + "epoch": 3.45, + "learning_rate": 3.277261200338124e-05, + "loss": 0.3528, + "step": 4076, + "task_loss": 0.1384962648153305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8540109992027283, + "epoch": 3.45, + "learning_rate": 3.276838546069316e-05, + "loss": 0.5834, + "step": 4077, + "task_loss": 0.9989869594573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5168101191520691, + "epoch": 3.45, + "learning_rate": 3.276415891800507e-05, + "loss": 0.5211, + "step": 4078, + "task_loss": 0.8434225916862488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43424078822135925, + "epoch": 3.45, + "learning_rate": 3.275993237531699e-05, + "loss": 0.578, + "step": 4079, + "task_loss": 0.5573917627334595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5935831069946289, + "epoch": 3.45, + "learning_rate": 3.275570583262891e-05, + "loss": 0.7061, + "step": 4080, + "task_loss": 0.1950499415397644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5144053101539612, + "epoch": 3.45, + "learning_rate": 3.275147928994083e-05, + "loss": 0.6545, + "step": 4081, + "task_loss": 1.4214547872543335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9065748453140259, + "epoch": 3.45, + "learning_rate": 3.274725274725275e-05, + "loss": 0.8582, + "step": 4082, + "task_loss": 1.4179044961929321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4021805226802826, + "epoch": 3.45, + "learning_rate": 3.274302620456467e-05, + "loss": 0.6486, + "step": 4083, + "task_loss": 0.3329828381538391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6248587369918823, + "epoch": 3.45, + "learning_rate": 3.273879966187658e-05, + "loss": 0.8051, + "step": 4084, + "task_loss": 0.6296215653419495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5132864117622375, + "epoch": 3.45, + "learning_rate": 3.27345731191885e-05, + "loss": 0.4438, + "step": 4085, + "task_loss": 0.9903122186660767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34761959314346313, + "epoch": 3.45, + "learning_rate": 3.273034657650043e-05, + "loss": 0.557, + "step": 4086, + "task_loss": 0.6588301658630371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7301135063171387, + "epoch": 3.45, + "learning_rate": 3.272612003381234e-05, + "loss": 0.5641, + "step": 4087, + "task_loss": 0.5226346850395203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6535298824310303, + "epoch": 3.46, + "learning_rate": 3.272189349112426e-05, + "loss": 0.9017, + "step": 4088, + "task_loss": 0.8846993446350098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6151911020278931, + "epoch": 3.46, + "learning_rate": 3.271766694843618e-05, + "loss": 0.4643, + "step": 4089, + "task_loss": 0.8631864786148071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6418994069099426, + "epoch": 3.46, + "learning_rate": 3.2713440405748094e-05, + "loss": 0.6072, + "step": 4090, + "task_loss": 1.6342202425003052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6189375519752502, + "epoch": 3.46, + "learning_rate": 3.270921386306002e-05, + "loss": 0.7721, + "step": 4091, + "task_loss": 0.5247688889503479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5908918380737305, + "epoch": 3.46, + "learning_rate": 3.270498732037194e-05, + "loss": 0.802, + "step": 4092, + "task_loss": 0.6192498803138733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35019874572753906, + "epoch": 3.46, + "learning_rate": 3.270076077768386e-05, + "loss": 0.5409, + "step": 4093, + "task_loss": 0.3839492201805115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27196240425109863, + "epoch": 3.46, + "learning_rate": 3.269653423499577e-05, + "loss": 0.4499, + "step": 4094, + "task_loss": 0.21329045295715332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5424904823303223, + "epoch": 3.46, + "learning_rate": 3.269230769230769e-05, + "loss": 0.636, + "step": 4095, + "task_loss": 1.7122716903686523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3017902374267578, + "epoch": 3.46, + "learning_rate": 3.268808114961961e-05, + "loss": 0.6089, + "step": 4096, + "task_loss": 0.2441330999135971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43357884883880615, + "epoch": 3.46, + "learning_rate": 3.268385460693153e-05, + "loss": 0.5503, + "step": 4097, + "task_loss": 0.25708678364753723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4622652232646942, + "epoch": 3.46, + "learning_rate": 3.267962806424345e-05, + "loss": 0.4943, + "step": 4098, + "task_loss": 0.5686721801757812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1767756938934326, + "epoch": 3.46, + "learning_rate": 3.267540152155537e-05, + "loss": 0.6719, + "step": 4099, + "task_loss": 0.7881876230239868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4500119686126709, + "epoch": 3.47, + "learning_rate": 3.2671174978867285e-05, + "loss": 0.5455, + "step": 4100, + "task_loss": 0.3744850754737854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7269495725631714, + "epoch": 3.47, + "learning_rate": 3.2666948436179205e-05, + "loss": 0.7929, + "step": 4101, + "task_loss": 0.3960084021091461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6521552801132202, + "epoch": 3.47, + "learning_rate": 3.2662721893491124e-05, + "loss": 0.5437, + "step": 4102, + "task_loss": 0.45776593685150146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5061033368110657, + "epoch": 3.47, + "learning_rate": 3.2658495350803044e-05, + "loss": 0.6641, + "step": 4103, + "task_loss": 1.0900051593780518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7579696178436279, + "epoch": 3.47, + "learning_rate": 3.2654268808114964e-05, + "loss": 0.6505, + "step": 4104, + "task_loss": 1.058566927909851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6288526654243469, + "epoch": 3.47, + "learning_rate": 3.2650042265426884e-05, + "loss": 0.6373, + "step": 4105, + "task_loss": 0.5247631072998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5674715042114258, + "epoch": 3.47, + "learning_rate": 3.2645815722738803e-05, + "loss": 0.6245, + "step": 4106, + "task_loss": 0.6395894289016724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37211471796035767, + "epoch": 3.47, + "learning_rate": 3.2641589180050716e-05, + "loss": 0.5863, + "step": 4107, + "task_loss": 0.8074463605880737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8475644588470459, + "epoch": 3.47, + "learning_rate": 3.263736263736264e-05, + "loss": 0.5912, + "step": 4108, + "task_loss": 0.6117957234382629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.014394760131836, + "epoch": 3.47, + "learning_rate": 3.263313609467456e-05, + "loss": 0.6077, + "step": 4109, + "task_loss": 0.7403123378753662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5096998810768127, + "epoch": 3.47, + "learning_rate": 3.2628909551986476e-05, + "loss": 0.4936, + "step": 4110, + "task_loss": 0.5338015556335449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7407160997390747, + "epoch": 3.47, + "learning_rate": 3.2624683009298395e-05, + "loss": 0.7413, + "step": 4111, + "task_loss": 0.42836934328079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6188712120056152, + "epoch": 3.48, + "learning_rate": 3.2620456466610315e-05, + "loss": 0.5681, + "step": 4112, + "task_loss": 0.625019907951355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5124812126159668, + "epoch": 3.48, + "learning_rate": 3.261622992392223e-05, + "loss": 0.4905, + "step": 4113, + "task_loss": 0.9299064874649048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3338650166988373, + "epoch": 3.48, + "learning_rate": 3.2612003381234155e-05, + "loss": 0.5029, + "step": 4114, + "task_loss": 0.22932808101177216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5770798921585083, + "epoch": 3.48, + "learning_rate": 3.2607776838546074e-05, + "loss": 0.5792, + "step": 4115, + "task_loss": 0.17936843633651733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7312511801719666, + "epoch": 3.48, + "learning_rate": 3.260355029585799e-05, + "loss": 0.6715, + "step": 4116, + "task_loss": 1.1348507404327393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4917176067829132, + "epoch": 3.48, + "learning_rate": 3.259932375316991e-05, + "loss": 0.5181, + "step": 4117, + "task_loss": 0.6452078223228455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6049743890762329, + "epoch": 3.48, + "learning_rate": 3.259509721048183e-05, + "loss": 0.7414, + "step": 4118, + "task_loss": 0.9584172368049622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4489871561527252, + "epoch": 3.48, + "learning_rate": 3.2590870667793746e-05, + "loss": 0.4471, + "step": 4119, + "task_loss": 1.2409334182739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24057623744010925, + "epoch": 3.48, + "learning_rate": 3.2586644125105666e-05, + "loss": 0.5333, + "step": 4120, + "task_loss": 0.9440165758132935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4532517194747925, + "epoch": 3.48, + "learning_rate": 3.2582417582417586e-05, + "loss": 0.854, + "step": 4121, + "task_loss": 0.6270748376846313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8020159602165222, + "epoch": 3.48, + "learning_rate": 3.2578191039729506e-05, + "loss": 0.6152, + "step": 4122, + "task_loss": 0.526718020439148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6842496395111084, + "epoch": 3.48, + "learning_rate": 3.257396449704142e-05, + "loss": 0.6181, + "step": 4123, + "task_loss": 0.6086697578430176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5249884128570557, + "epoch": 3.49, + "learning_rate": 3.256973795435334e-05, + "loss": 0.6199, + "step": 4124, + "task_loss": 0.33074983954429626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.051375150680542, + "epoch": 3.49, + "learning_rate": 3.2565511411665265e-05, + "loss": 0.6707, + "step": 4125, + "task_loss": 0.9608281850814819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5947024822235107, + "epoch": 3.49, + "learning_rate": 3.256128486897718e-05, + "loss": 0.5817, + "step": 4126, + "task_loss": 1.190467119216919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5592034459114075, + "epoch": 3.49, + "learning_rate": 3.25570583262891e-05, + "loss": 0.6001, + "step": 4127, + "task_loss": 0.2505715489387512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6960796117782593, + "epoch": 3.49, + "learning_rate": 3.255283178360102e-05, + "loss": 0.5672, + "step": 4128, + "task_loss": 0.39553961157798767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3609251976013184, + "epoch": 3.49, + "learning_rate": 3.254860524091293e-05, + "loss": 0.8422, + "step": 4129, + "task_loss": 0.9063063263893127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.734248161315918, + "epoch": 3.49, + "learning_rate": 3.254437869822485e-05, + "loss": 0.6301, + "step": 4130, + "task_loss": 0.5371847152709961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7986608743667603, + "epoch": 3.49, + "learning_rate": 3.2540152155536777e-05, + "loss": 0.5571, + "step": 4131, + "task_loss": 1.0364172458648682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7163912057876587, + "epoch": 3.49, + "learning_rate": 3.253592561284869e-05, + "loss": 0.4599, + "step": 4132, + "task_loss": 0.8118235468864441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6304093599319458, + "epoch": 3.49, + "learning_rate": 3.253169907016061e-05, + "loss": 0.6808, + "step": 4133, + "task_loss": 0.6577768325805664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7390733957290649, + "epoch": 3.49, + "learning_rate": 3.252747252747253e-05, + "loss": 0.7871, + "step": 4134, + "task_loss": 0.5037205815315247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43962255120277405, + "epoch": 3.5, + "learning_rate": 3.252324598478445e-05, + "loss": 0.6081, + "step": 4135, + "task_loss": 0.6094099283218384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47717249393463135, + "epoch": 3.5, + "learning_rate": 3.251901944209637e-05, + "loss": 0.5113, + "step": 4136, + "task_loss": 0.7854862809181213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6500998139381409, + "epoch": 3.5, + "learning_rate": 3.251479289940829e-05, + "loss": 0.6567, + "step": 4137, + "task_loss": 0.5980322360992432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4689990282058716, + "epoch": 3.5, + "learning_rate": 3.251056635672021e-05, + "loss": 0.6206, + "step": 4138, + "task_loss": 0.18691451847553253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6646602153778076, + "epoch": 3.5, + "learning_rate": 3.250633981403212e-05, + "loss": 0.6217, + "step": 4139, + "task_loss": 0.484631210565567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5594324469566345, + "epoch": 3.5, + "learning_rate": 3.250211327134404e-05, + "loss": 0.9303, + "step": 4140, + "task_loss": 0.44872263073921204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0829566717147827, + "epoch": 3.5, + "learning_rate": 3.249788672865596e-05, + "loss": 0.7187, + "step": 4141, + "task_loss": 0.9525862336158752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7319719791412354, + "epoch": 3.5, + "learning_rate": 3.249366018596788e-05, + "loss": 0.6668, + "step": 4142, + "task_loss": 1.2924727201461792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48716580867767334, + "epoch": 3.5, + "learning_rate": 3.24894336432798e-05, + "loss": 0.5509, + "step": 4143, + "task_loss": 0.06743727624416351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5892049670219421, + "epoch": 3.5, + "learning_rate": 3.248520710059172e-05, + "loss": 0.6057, + "step": 4144, + "task_loss": 0.7050666213035583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5103580355644226, + "epoch": 3.5, + "learning_rate": 3.248098055790363e-05, + "loss": 0.478, + "step": 4145, + "task_loss": 0.5845856070518494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5487319827079773, + "epoch": 3.5, + "learning_rate": 3.247675401521555e-05, + "loss": 0.6386, + "step": 4146, + "task_loss": 0.9110307097434998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8138513565063477, + "epoch": 3.51, + "learning_rate": 3.247252747252747e-05, + "loss": 0.6887, + "step": 4147, + "task_loss": 0.7805132865905762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5571469068527222, + "epoch": 3.51, + "learning_rate": 3.246830092983939e-05, + "loss": 0.6094, + "step": 4148, + "task_loss": 0.672821044921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7280246019363403, + "epoch": 3.51, + "learning_rate": 3.246407438715131e-05, + "loss": 0.523, + "step": 4149, + "task_loss": 0.2927163243293762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7121900320053101, + "epoch": 3.51, + "learning_rate": 3.245984784446323e-05, + "loss": 0.7706, + "step": 4150, + "task_loss": 1.4445549249649048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5459108948707581, + "epoch": 3.51, + "learning_rate": 3.245562130177515e-05, + "loss": 0.7939, + "step": 4151, + "task_loss": 0.28922948241233826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7208449840545654, + "epoch": 3.51, + "learning_rate": 3.2451394759087064e-05, + "loss": 0.7225, + "step": 4152, + "task_loss": 0.22648075222969055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5601734519004822, + "epoch": 3.51, + "learning_rate": 3.244716821639899e-05, + "loss": 0.6278, + "step": 4153, + "task_loss": 1.0026342868804932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6091493368148804, + "epoch": 3.51, + "learning_rate": 3.244294167371091e-05, + "loss": 0.6187, + "step": 4154, + "task_loss": 0.5586197972297668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4073311686515808, + "epoch": 3.51, + "learning_rate": 3.243871513102282e-05, + "loss": 0.4921, + "step": 4155, + "task_loss": 0.3951167166233063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5530438423156738, + "epoch": 3.51, + "learning_rate": 3.243448858833474e-05, + "loss": 0.5994, + "step": 4156, + "task_loss": 0.595047116279602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7245615124702454, + "epoch": 3.51, + "learning_rate": 3.243026204564666e-05, + "loss": 0.5904, + "step": 4157, + "task_loss": 0.5793409943580627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6666686534881592, + "epoch": 3.51, + "learning_rate": 3.242603550295858e-05, + "loss": 0.707, + "step": 4158, + "task_loss": 1.3579907417297363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6762040853500366, + "epoch": 3.52, + "learning_rate": 3.24218089602705e-05, + "loss": 0.5523, + "step": 4159, + "task_loss": 0.7119670510292053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9305746555328369, + "epoch": 3.52, + "learning_rate": 3.241758241758242e-05, + "loss": 0.6398, + "step": 4160, + "task_loss": 1.2407652139663696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6372056603431702, + "epoch": 3.52, + "learning_rate": 3.2413355874894335e-05, + "loss": 0.6543, + "step": 4161, + "task_loss": 1.2462353706359863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9108688235282898, + "epoch": 3.52, + "learning_rate": 3.2409129332206255e-05, + "loss": 0.6203, + "step": 4162, + "task_loss": 1.2412667274475098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4291374385356903, + "epoch": 3.52, + "learning_rate": 3.2404902789518174e-05, + "loss": 0.5399, + "step": 4163, + "task_loss": 0.767941415309906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.5653200149536133, + "epoch": 3.52, + "learning_rate": 3.2400676246830094e-05, + "loss": 0.9669, + "step": 4164, + "task_loss": 1.1558754444122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5701494216918945, + "epoch": 3.52, + "learning_rate": 3.2396449704142014e-05, + "loss": 0.6029, + "step": 4165, + "task_loss": 1.1915533542633057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3344026207923889, + "epoch": 3.52, + "learning_rate": 3.2392223161453934e-05, + "loss": 0.5296, + "step": 4166, + "task_loss": 0.39103928208351135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6021039485931396, + "epoch": 3.52, + "learning_rate": 3.238799661876585e-05, + "loss": 0.7829, + "step": 4167, + "task_loss": 1.1129626035690308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5124154090881348, + "epoch": 3.52, + "learning_rate": 3.2383770076077766e-05, + "loss": 0.6512, + "step": 4168, + "task_loss": 0.40392643213272095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.407939612865448, + "epoch": 3.52, + "learning_rate": 3.2379543533389686e-05, + "loss": 0.4337, + "step": 4169, + "task_loss": 0.09862037003040314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6697307825088501, + "epoch": 3.52, + "learning_rate": 3.237531699070161e-05, + "loss": 0.5467, + "step": 4170, + "task_loss": 0.7681911587715149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5678291320800781, + "epoch": 3.53, + "learning_rate": 3.2371090448013525e-05, + "loss": 0.5375, + "step": 4171, + "task_loss": 0.2743608057498932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8003935813903809, + "epoch": 3.53, + "learning_rate": 3.2366863905325445e-05, + "loss": 0.562, + "step": 4172, + "task_loss": 1.7369859218597412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8253204822540283, + "epoch": 3.53, + "learning_rate": 3.2362637362637365e-05, + "loss": 0.814, + "step": 4173, + "task_loss": 2.1610679626464844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7380273342132568, + "epoch": 3.53, + "learning_rate": 3.235841081994928e-05, + "loss": 0.6779, + "step": 4174, + "task_loss": 1.8532426357269287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42812979221343994, + "epoch": 3.53, + "learning_rate": 3.2354184277261204e-05, + "loss": 0.566, + "step": 4175, + "task_loss": 0.7260921001434326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7297577857971191, + "epoch": 3.53, + "learning_rate": 3.2349957734573124e-05, + "loss": 0.6703, + "step": 4176, + "task_loss": 0.7097510099411011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5491771697998047, + "epoch": 3.53, + "learning_rate": 3.234573119188504e-05, + "loss": 0.6603, + "step": 4177, + "task_loss": 0.992276668548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35628795623779297, + "epoch": 3.53, + "learning_rate": 3.234150464919696e-05, + "loss": 0.5347, + "step": 4178, + "task_loss": 0.4289785623550415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7322530746459961, + "epoch": 3.53, + "learning_rate": 3.2337278106508877e-05, + "loss": 0.5781, + "step": 4179, + "task_loss": 0.947498083114624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.655799150466919, + "epoch": 3.53, + "learning_rate": 3.2333051563820796e-05, + "loss": 0.5234, + "step": 4180, + "task_loss": 0.5578104257583618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7058315277099609, + "epoch": 3.53, + "learning_rate": 3.2328825021132716e-05, + "loss": 0.7592, + "step": 4181, + "task_loss": 1.0126543045043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6310428380966187, + "epoch": 3.53, + "learning_rate": 3.2324598478444636e-05, + "loss": 0.6544, + "step": 4182, + "task_loss": 1.3184713125228882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4339359700679779, + "epoch": 3.54, + "learning_rate": 3.2320371935756556e-05, + "loss": 0.6474, + "step": 4183, + "task_loss": 0.09060013294219971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6512573957443237, + "epoch": 3.54, + "learning_rate": 3.231614539306847e-05, + "loss": 0.5401, + "step": 4184, + "task_loss": 0.8618577718734741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5656225085258484, + "epoch": 3.54, + "learning_rate": 3.231191885038039e-05, + "loss": 0.568, + "step": 4185, + "task_loss": 1.2756853103637695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6193535923957825, + "epoch": 3.54, + "learning_rate": 3.230769230769231e-05, + "loss": 0.566, + "step": 4186, + "task_loss": 0.8474246263504028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7238597869873047, + "epoch": 3.54, + "learning_rate": 3.230346576500423e-05, + "loss": 0.5672, + "step": 4187, + "task_loss": 0.5749729871749878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4649112820625305, + "epoch": 3.54, + "learning_rate": 3.229923922231615e-05, + "loss": 0.6606, + "step": 4188, + "task_loss": 0.8154797554016113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5395984649658203, + "epoch": 3.54, + "learning_rate": 3.229501267962807e-05, + "loss": 0.6846, + "step": 4189, + "task_loss": 0.9181526303291321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6296202540397644, + "epoch": 3.54, + "learning_rate": 3.229078613693998e-05, + "loss": 0.6547, + "step": 4190, + "task_loss": 0.6031090021133423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4456971287727356, + "epoch": 3.54, + "learning_rate": 3.22865595942519e-05, + "loss": 0.4878, + "step": 4191, + "task_loss": 0.14205634593963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7604584693908691, + "epoch": 3.54, + "learning_rate": 3.2282333051563826e-05, + "loss": 0.7052, + "step": 4192, + "task_loss": 1.1351755857467651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6406766176223755, + "epoch": 3.54, + "learning_rate": 3.2278106508875746e-05, + "loss": 0.6171, + "step": 4193, + "task_loss": 1.1458464860916138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6606093645095825, + "epoch": 3.54, + "learning_rate": 3.227387996618766e-05, + "loss": 0.5583, + "step": 4194, + "task_loss": 0.3046940863132477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48414623737335205, + "epoch": 3.55, + "learning_rate": 3.226965342349958e-05, + "loss": 0.6198, + "step": 4195, + "task_loss": 1.0572696924209595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6292560696601868, + "epoch": 3.55, + "learning_rate": 3.22654268808115e-05, + "loss": 0.5071, + "step": 4196, + "task_loss": 0.6799464821815491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5613049268722534, + "epoch": 3.55, + "learning_rate": 3.226120033812342e-05, + "loss": 0.5547, + "step": 4197, + "task_loss": 0.6906691193580627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3893170654773712, + "epoch": 3.55, + "learning_rate": 3.225697379543534e-05, + "loss": 0.5784, + "step": 4198, + "task_loss": 1.097582459449768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.531936764717102, + "epoch": 3.55, + "learning_rate": 3.225274725274726e-05, + "loss": 0.5647, + "step": 4199, + "task_loss": 0.904606819152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.70243239402771, + "epoch": 3.55, + "learning_rate": 3.224852071005917e-05, + "loss": 0.6304, + "step": 4200, + "task_loss": 0.8562054634094238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5404973030090332, + "epoch": 3.55, + "learning_rate": 3.224429416737109e-05, + "loss": 0.4272, + "step": 4201, + "task_loss": 0.38080593943595886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9328231811523438, + "epoch": 3.55, + "learning_rate": 3.224006762468301e-05, + "loss": 0.6323, + "step": 4202, + "task_loss": 0.48349064588546753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.568303108215332, + "epoch": 3.55, + "learning_rate": 3.223584108199493e-05, + "loss": 0.5869, + "step": 4203, + "task_loss": 0.361704558134079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7974933385848999, + "epoch": 3.55, + "learning_rate": 3.223161453930685e-05, + "loss": 0.7818, + "step": 4204, + "task_loss": 0.9017975330352783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35864779353141785, + "epoch": 3.55, + "learning_rate": 3.222738799661877e-05, + "loss": 0.5251, + "step": 4205, + "task_loss": 0.5652700662612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8452371954917908, + "epoch": 3.56, + "learning_rate": 3.222316145393068e-05, + "loss": 0.7821, + "step": 4206, + "task_loss": 0.7905057072639465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49725431203842163, + "epoch": 3.56, + "learning_rate": 3.22189349112426e-05, + "loss": 0.6887, + "step": 4207, + "task_loss": 0.24928267300128937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6178297996520996, + "epoch": 3.56, + "learning_rate": 3.221470836855452e-05, + "loss": 0.5617, + "step": 4208, + "task_loss": 0.4262554347515106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5190914869308472, + "epoch": 3.56, + "learning_rate": 3.221048182586645e-05, + "loss": 0.604, + "step": 4209, + "task_loss": 0.7261759042739868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.473241925239563, + "epoch": 3.56, + "learning_rate": 3.220625528317836e-05, + "loss": 0.6712, + "step": 4210, + "task_loss": 0.2535596787929535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.665992796421051, + "epoch": 3.56, + "learning_rate": 3.220202874049028e-05, + "loss": 0.7544, + "step": 4211, + "task_loss": 0.8691056966781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7985689043998718, + "epoch": 3.56, + "learning_rate": 3.21978021978022e-05, + "loss": 0.6876, + "step": 4212, + "task_loss": 0.8100976347923279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5654987096786499, + "epoch": 3.56, + "learning_rate": 3.2193575655114114e-05, + "loss": 0.609, + "step": 4213, + "task_loss": 0.7667639851570129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.638790488243103, + "epoch": 3.56, + "learning_rate": 3.218934911242604e-05, + "loss": 0.6383, + "step": 4214, + "task_loss": 1.1682111024856567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7357028722763062, + "epoch": 3.56, + "learning_rate": 3.218512256973796e-05, + "loss": 0.7164, + "step": 4215, + "task_loss": 0.844890832901001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4252817630767822, + "epoch": 3.56, + "learning_rate": 3.218089602704987e-05, + "loss": 0.4598, + "step": 4216, + "task_loss": 0.43803659081459045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47045400738716125, + "epoch": 3.56, + "learning_rate": 3.217666948436179e-05, + "loss": 0.6149, + "step": 4217, + "task_loss": 0.7341237664222717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46930402517318726, + "epoch": 3.57, + "learning_rate": 3.217244294167371e-05, + "loss": 0.7054, + "step": 4218, + "task_loss": 0.5197510719299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6082526445388794, + "epoch": 3.57, + "learning_rate": 3.216821639898563e-05, + "loss": 0.5322, + "step": 4219, + "task_loss": 0.9213230609893799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5877562761306763, + "epoch": 3.57, + "learning_rate": 3.216398985629755e-05, + "loss": 0.8263, + "step": 4220, + "task_loss": 1.1192339658737183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2452743649482727, + "epoch": 3.57, + "learning_rate": 3.215976331360947e-05, + "loss": 0.5747, + "step": 4221, + "task_loss": 0.06740829348564148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4869162440299988, + "epoch": 3.57, + "learning_rate": 3.215553677092139e-05, + "loss": 0.4561, + "step": 4222, + "task_loss": 0.8501110076904297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7156858444213867, + "epoch": 3.57, + "learning_rate": 3.2151310228233304e-05, + "loss": 0.5241, + "step": 4223, + "task_loss": 0.9623615145683289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9896580576896667, + "epoch": 3.57, + "learning_rate": 3.2147083685545224e-05, + "loss": 0.6587, + "step": 4224, + "task_loss": 1.3983900547027588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6754773855209351, + "epoch": 3.57, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.5931, + "step": 4225, + "task_loss": 1.2624531984329224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.463307648897171, + "epoch": 3.57, + "learning_rate": 3.2138630600169064e-05, + "loss": 0.5911, + "step": 4226, + "task_loss": 1.2462469339370728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4898664951324463, + "epoch": 3.57, + "learning_rate": 3.2134404057480983e-05, + "loss": 0.4909, + "step": 4227, + "task_loss": 0.5271729230880737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6329209804534912, + "epoch": 3.57, + "learning_rate": 3.21301775147929e-05, + "loss": 0.5239, + "step": 4228, + "task_loss": 0.42427557706832886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8216217160224915, + "epoch": 3.57, + "learning_rate": 3.2125950972104816e-05, + "loss": 0.8204, + "step": 4229, + "task_loss": 1.9188570976257324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6262178421020508, + "epoch": 3.58, + "learning_rate": 3.2121724429416736e-05, + "loss": 0.5102, + "step": 4230, + "task_loss": 1.105137825012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6141501665115356, + "epoch": 3.58, + "learning_rate": 3.211749788672866e-05, + "loss": 0.5768, + "step": 4231, + "task_loss": 0.8140987753868103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3838309049606323, + "epoch": 3.58, + "learning_rate": 3.2113271344040575e-05, + "loss": 0.5679, + "step": 4232, + "task_loss": 0.7179095149040222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5304633378982544, + "epoch": 3.58, + "learning_rate": 3.2109044801352495e-05, + "loss": 0.6251, + "step": 4233, + "task_loss": 0.7526938319206238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3643147945404053, + "epoch": 3.58, + "learning_rate": 3.2104818258664415e-05, + "loss": 0.504, + "step": 4234, + "task_loss": 0.3876301050186157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3744189739227295, + "epoch": 3.58, + "learning_rate": 3.210059171597633e-05, + "loss": 0.658, + "step": 4235, + "task_loss": 0.9470755457878113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34365764260292053, + "epoch": 3.58, + "learning_rate": 3.2096365173288254e-05, + "loss": 0.4737, + "step": 4236, + "task_loss": 0.3356644809246063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8528670072555542, + "epoch": 3.58, + "learning_rate": 3.2092138630600174e-05, + "loss": 0.663, + "step": 4237, + "task_loss": 0.8852180242538452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35245010256767273, + "epoch": 3.58, + "learning_rate": 3.2087912087912094e-05, + "loss": 0.5989, + "step": 4238, + "task_loss": 1.1525589227676392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6179569959640503, + "epoch": 3.58, + "learning_rate": 3.208368554522401e-05, + "loss": 0.5201, + "step": 4239, + "task_loss": 0.2603001296520233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9398728609085083, + "epoch": 3.58, + "learning_rate": 3.2079459002535926e-05, + "loss": 0.6746, + "step": 4240, + "task_loss": 1.5526063442230225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5932520627975464, + "epoch": 3.58, + "learning_rate": 3.2075232459847846e-05, + "loss": 0.6777, + "step": 4241, + "task_loss": 0.7001931071281433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6230915784835815, + "epoch": 3.59, + "learning_rate": 3.2071005917159766e-05, + "loss": 0.7681, + "step": 4242, + "task_loss": 1.1468700170516968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7679072618484497, + "epoch": 3.59, + "learning_rate": 3.2066779374471686e-05, + "loss": 0.6807, + "step": 4243, + "task_loss": 0.41273894906044006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5656412243843079, + "epoch": 3.59, + "learning_rate": 3.2062552831783605e-05, + "loss": 0.5988, + "step": 4244, + "task_loss": 0.8380447626113892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3785839080810547, + "epoch": 3.59, + "learning_rate": 3.205832628909552e-05, + "loss": 0.6497, + "step": 4245, + "task_loss": 0.1459299921989441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4484691023826599, + "epoch": 3.59, + "learning_rate": 3.205409974640744e-05, + "loss": 0.5873, + "step": 4246, + "task_loss": 0.13609357178211212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36376792192459106, + "epoch": 3.59, + "learning_rate": 3.204987320371936e-05, + "loss": 0.5182, + "step": 4247, + "task_loss": 0.6277802586555481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6348499059677124, + "epoch": 3.59, + "learning_rate": 3.204564666103128e-05, + "loss": 0.5093, + "step": 4248, + "task_loss": 0.3349606990814209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5473830699920654, + "epoch": 3.59, + "learning_rate": 3.20414201183432e-05, + "loss": 0.5383, + "step": 4249, + "task_loss": 1.2277259826660156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46672943234443665, + "epoch": 3.59, + "learning_rate": 3.203719357565512e-05, + "loss": 0.457, + "step": 4250, + "task_loss": 0.6045480966567993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7462354898452759, + "epoch": 3.59, + "learning_rate": 3.203296703296704e-05, + "loss": 0.7498, + "step": 4251, + "task_loss": 0.43652093410491943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3947990834712982, + "epoch": 3.59, + "learning_rate": 3.202874049027895e-05, + "loss": 0.4996, + "step": 4252, + "task_loss": 0.5876834392547607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4972449243068695, + "epoch": 3.59, + "learning_rate": 3.202451394759087e-05, + "loss": 0.7479, + "step": 4253, + "task_loss": 2.7319962978363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2912489175796509, + "epoch": 3.6, + "learning_rate": 3.2020287404902796e-05, + "loss": 0.6889, + "step": 4254, + "task_loss": 0.9310650825500488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4191456437110901, + "epoch": 3.6, + "learning_rate": 3.201606086221471e-05, + "loss": 0.5654, + "step": 4255, + "task_loss": 0.37675347924232483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.3826098442077637, + "epoch": 3.6, + "learning_rate": 3.201183431952663e-05, + "loss": 0.7594, + "step": 4256, + "task_loss": 1.8409432172775269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8128854632377625, + "epoch": 3.6, + "learning_rate": 3.200760777683855e-05, + "loss": 0.5836, + "step": 4257, + "task_loss": 0.6604255437850952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37107521295547485, + "epoch": 3.6, + "learning_rate": 3.200338123415046e-05, + "loss": 0.4756, + "step": 4258, + "task_loss": 0.44623643159866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6175744533538818, + "epoch": 3.6, + "learning_rate": 3.199915469146239e-05, + "loss": 0.725, + "step": 4259, + "task_loss": 1.1219886541366577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8102858066558838, + "epoch": 3.6, + "learning_rate": 3.199492814877431e-05, + "loss": 0.6319, + "step": 4260, + "task_loss": 0.6226069331169128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42583853006362915, + "epoch": 3.6, + "learning_rate": 3.199070160608622e-05, + "loss": 0.5787, + "step": 4261, + "task_loss": 0.7398431897163391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4259260296821594, + "epoch": 3.6, + "learning_rate": 3.198647506339814e-05, + "loss": 0.6635, + "step": 4262, + "task_loss": 0.5187358856201172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4843616485595703, + "epoch": 3.6, + "learning_rate": 3.198224852071006e-05, + "loss": 0.6515, + "step": 4263, + "task_loss": 0.8118218779563904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30011484026908875, + "epoch": 3.6, + "learning_rate": 3.197802197802198e-05, + "loss": 0.5075, + "step": 4264, + "task_loss": 0.024920698255300522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6588087677955627, + "epoch": 3.6, + "learning_rate": 3.19737954353339e-05, + "loss": 0.543, + "step": 4265, + "task_loss": 0.7919685244560242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49749594926834106, + "epoch": 3.61, + "learning_rate": 3.196956889264582e-05, + "loss": 0.4436, + "step": 4266, + "task_loss": 0.3057381510734558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6445580720901489, + "epoch": 3.61, + "learning_rate": 3.196534234995774e-05, + "loss": 0.5012, + "step": 4267, + "task_loss": 0.8744177222251892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5139703750610352, + "epoch": 3.61, + "learning_rate": 3.196111580726965e-05, + "loss": 0.5621, + "step": 4268, + "task_loss": 1.2391936779022217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.523512065410614, + "epoch": 3.61, + "learning_rate": 3.195688926458157e-05, + "loss": 0.8205, + "step": 4269, + "task_loss": 0.42643898725509644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5993216633796692, + "epoch": 3.61, + "learning_rate": 3.195266272189349e-05, + "loss": 0.5618, + "step": 4270, + "task_loss": 0.3088935315608978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8363982439041138, + "epoch": 3.61, + "learning_rate": 3.194843617920541e-05, + "loss": 0.6916, + "step": 4271, + "task_loss": 2.10528302192688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5748842358589172, + "epoch": 3.61, + "learning_rate": 3.194420963651733e-05, + "loss": 0.6422, + "step": 4272, + "task_loss": 1.529256820678711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5410435199737549, + "epoch": 3.61, + "learning_rate": 3.193998309382925e-05, + "loss": 0.5835, + "step": 4273, + "task_loss": 0.9316344261169434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32317885756492615, + "epoch": 3.61, + "learning_rate": 3.1935756551141164e-05, + "loss": 0.5987, + "step": 4274, + "task_loss": 0.921099066734314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5443952083587646, + "epoch": 3.61, + "learning_rate": 3.1931530008453083e-05, + "loss": 0.5943, + "step": 4275, + "task_loss": 0.8989710211753845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6211710572242737, + "epoch": 3.61, + "learning_rate": 3.192730346576501e-05, + "loss": 0.6788, + "step": 4276, + "task_loss": 0.5778294205665588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8003108501434326, + "epoch": 3.61, + "learning_rate": 3.192307692307692e-05, + "loss": 0.6921, + "step": 4277, + "task_loss": 0.7750545740127563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43057748675346375, + "epoch": 3.62, + "learning_rate": 3.191885038038884e-05, + "loss": 0.4917, + "step": 4278, + "task_loss": 0.2805824279785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5174372792243958, + "epoch": 3.62, + "learning_rate": 3.191462383770076e-05, + "loss": 0.6098, + "step": 4279, + "task_loss": 0.4504948556423187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4709424078464508, + "epoch": 3.62, + "learning_rate": 3.1910397295012675e-05, + "loss": 0.5441, + "step": 4280, + "task_loss": 0.4343849718570709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9384723901748657, + "epoch": 3.62, + "learning_rate": 3.19061707523246e-05, + "loss": 0.6999, + "step": 4281, + "task_loss": 1.215951681137085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6971558332443237, + "epoch": 3.62, + "learning_rate": 3.190194420963652e-05, + "loss": 0.738, + "step": 4282, + "task_loss": 1.10869300365448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29778748750686646, + "epoch": 3.62, + "learning_rate": 3.189771766694844e-05, + "loss": 0.4081, + "step": 4283, + "task_loss": 0.2580684721469879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6616436839103699, + "epoch": 3.62, + "learning_rate": 3.1893491124260354e-05, + "loss": 0.4991, + "step": 4284, + "task_loss": 0.9533717036247253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37196701765060425, + "epoch": 3.62, + "learning_rate": 3.1889264581572274e-05, + "loss": 0.6114, + "step": 4285, + "task_loss": 0.3012705147266388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9313027858734131, + "epoch": 3.62, + "learning_rate": 3.1885038038884194e-05, + "loss": 0.595, + "step": 4286, + "task_loss": 0.8316695690155029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3201221525669098, + "epoch": 3.62, + "learning_rate": 3.1880811496196113e-05, + "loss": 0.4636, + "step": 4287, + "task_loss": 0.3647851049900055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4864259362220764, + "epoch": 3.62, + "learning_rate": 3.187658495350803e-05, + "loss": 0.5722, + "step": 4288, + "task_loss": 1.0231246948242188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7902079820632935, + "epoch": 3.63, + "learning_rate": 3.187235841081995e-05, + "loss": 0.7221, + "step": 4289, + "task_loss": 0.8456388115882874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7198704481124878, + "epoch": 3.63, + "learning_rate": 3.1868131868131866e-05, + "loss": 0.743, + "step": 4290, + "task_loss": 1.1135166883468628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5688356161117554, + "epoch": 3.63, + "learning_rate": 3.1863905325443786e-05, + "loss": 0.8247, + "step": 4291, + "task_loss": 0.44530370831489563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6359323859214783, + "epoch": 3.63, + "learning_rate": 3.1859678782755705e-05, + "loss": 0.6318, + "step": 4292, + "task_loss": 0.7541476488113403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6856552362442017, + "epoch": 3.63, + "learning_rate": 3.1855452240067625e-05, + "loss": 0.7056, + "step": 4293, + "task_loss": 1.386191725730896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7288258075714111, + "epoch": 3.63, + "learning_rate": 3.1851225697379545e-05, + "loss": 0.6303, + "step": 4294, + "task_loss": 1.078108787536621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4438767731189728, + "epoch": 3.63, + "learning_rate": 3.1846999154691465e-05, + "loss": 0.4995, + "step": 4295, + "task_loss": 0.8345937728881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5319458842277527, + "epoch": 3.63, + "learning_rate": 3.1842772612003384e-05, + "loss": 0.5186, + "step": 4296, + "task_loss": 0.5466291308403015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42933404445648193, + "epoch": 3.63, + "learning_rate": 3.18385460693153e-05, + "loss": 0.4203, + "step": 4297, + "task_loss": 0.537109911441803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5229090452194214, + "epoch": 3.63, + "learning_rate": 3.1834319526627224e-05, + "loss": 0.6674, + "step": 4298, + "task_loss": 1.1762605905532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41915687918663025, + "epoch": 3.63, + "learning_rate": 3.1830092983939144e-05, + "loss": 0.4691, + "step": 4299, + "task_loss": 0.5037978887557983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4521094560623169, + "epoch": 3.63, + "learning_rate": 3.1825866441251057e-05, + "loss": 0.6811, + "step": 4300, + "task_loss": 1.2478934526443481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5719486474990845, + "epoch": 3.64, + "learning_rate": 3.1821639898562976e-05, + "loss": 0.6389, + "step": 4301, + "task_loss": 0.5874079465866089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5724771022796631, + "epoch": 3.64, + "learning_rate": 3.1817413355874896e-05, + "loss": 0.6631, + "step": 4302, + "task_loss": 0.8944747447967529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6874879002571106, + "epoch": 3.64, + "learning_rate": 3.1813186813186816e-05, + "loss": 0.633, + "step": 4303, + "task_loss": 1.3476438522338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.721340537071228, + "epoch": 3.64, + "learning_rate": 3.1808960270498735e-05, + "loss": 0.6187, + "step": 4304, + "task_loss": 0.5936524868011475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8770106434822083, + "epoch": 3.64, + "learning_rate": 3.1804733727810655e-05, + "loss": 0.712, + "step": 4305, + "task_loss": 1.4871197938919067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40573886036872864, + "epoch": 3.64, + "learning_rate": 3.180050718512257e-05, + "loss": 0.5775, + "step": 4306, + "task_loss": 0.27883994579315186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45438435673713684, + "epoch": 3.64, + "learning_rate": 3.179628064243449e-05, + "loss": 0.4623, + "step": 4307, + "task_loss": 0.5053079724311829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30068594217300415, + "epoch": 3.64, + "learning_rate": 3.179205409974641e-05, + "loss": 0.5082, + "step": 4308, + "task_loss": 0.220564067363739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41267597675323486, + "epoch": 3.64, + "learning_rate": 3.178782755705833e-05, + "loss": 0.8542, + "step": 4309, + "task_loss": 1.4706257581710815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9790067672729492, + "epoch": 3.64, + "learning_rate": 3.178360101437025e-05, + "loss": 0.7086, + "step": 4310, + "task_loss": 1.2004257440567017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.007163405418396, + "epoch": 3.64, + "learning_rate": 3.177937447168217e-05, + "loss": 0.6727, + "step": 4311, + "task_loss": 0.6934151649475098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7979511022567749, + "epoch": 3.64, + "learning_rate": 3.177514792899409e-05, + "loss": 0.6568, + "step": 4312, + "task_loss": 0.7616295218467712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5598003268241882, + "epoch": 3.65, + "learning_rate": 3.1770921386306e-05, + "loss": 0.7707, + "step": 4313, + "task_loss": 0.5135715007781982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41841548681259155, + "epoch": 3.65, + "learning_rate": 3.176669484361792e-05, + "loss": 0.6926, + "step": 4314, + "task_loss": 0.642728328704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5688259601593018, + "epoch": 3.65, + "learning_rate": 3.1762468300929846e-05, + "loss": 0.5396, + "step": 4315, + "task_loss": 0.8958868980407715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4546813368797302, + "epoch": 3.65, + "learning_rate": 3.175824175824176e-05, + "loss": 0.5595, + "step": 4316, + "task_loss": 0.49206963181495667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6616531610488892, + "epoch": 3.65, + "learning_rate": 3.175401521555368e-05, + "loss": 0.6062, + "step": 4317, + "task_loss": 1.2377668619155884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5755642652511597, + "epoch": 3.65, + "learning_rate": 3.17497886728656e-05, + "loss": 0.6434, + "step": 4318, + "task_loss": 0.6238715648651123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35224103927612305, + "epoch": 3.65, + "learning_rate": 3.174556213017751e-05, + "loss": 0.4863, + "step": 4319, + "task_loss": 0.45886775851249695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7452940940856934, + "epoch": 3.65, + "learning_rate": 3.174133558748944e-05, + "loss": 0.6366, + "step": 4320, + "task_loss": 1.2794350385665894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.301813542842865, + "epoch": 3.65, + "learning_rate": 3.173710904480136e-05, + "loss": 0.6275, + "step": 4321, + "task_loss": 0.7418091297149658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3648691177368164, + "epoch": 3.65, + "learning_rate": 3.173288250211327e-05, + "loss": 0.6712, + "step": 4322, + "task_loss": 0.1716543734073639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8355896472930908, + "epoch": 3.65, + "learning_rate": 3.172865595942519e-05, + "loss": 0.7054, + "step": 4323, + "task_loss": 1.4562668800354004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3395461440086365, + "epoch": 3.65, + "learning_rate": 3.172442941673711e-05, + "loss": 0.582, + "step": 4324, + "task_loss": 0.5881152749061584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2862630784511566, + "epoch": 3.66, + "learning_rate": 3.172020287404903e-05, + "loss": 0.4907, + "step": 4325, + "task_loss": 0.048797111958265305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5867465734481812, + "epoch": 3.66, + "learning_rate": 3.171597633136095e-05, + "loss": 0.5474, + "step": 4326, + "task_loss": 1.4700002670288086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41917672753334045, + "epoch": 3.66, + "learning_rate": 3.171174978867287e-05, + "loss": 0.5455, + "step": 4327, + "task_loss": 0.4365634620189667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45651277899742126, + "epoch": 3.66, + "learning_rate": 3.170752324598479e-05, + "loss": 0.5857, + "step": 4328, + "task_loss": 0.3946208655834198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46523886919021606, + "epoch": 3.66, + "learning_rate": 3.17032967032967e-05, + "loss": 0.5393, + "step": 4329, + "task_loss": 0.3044187128543854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0740121603012085, + "epoch": 3.66, + "learning_rate": 3.169907016060862e-05, + "loss": 0.7448, + "step": 4330, + "task_loss": 1.0956947803497314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.526522696018219, + "epoch": 3.66, + "learning_rate": 3.169484361792054e-05, + "loss": 0.56, + "step": 4331, + "task_loss": 0.28842079639434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39969658851623535, + "epoch": 3.66, + "learning_rate": 3.169061707523246e-05, + "loss": 0.4289, + "step": 4332, + "task_loss": 0.5743923187255859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9064669609069824, + "epoch": 3.66, + "learning_rate": 3.168639053254438e-05, + "loss": 0.5589, + "step": 4333, + "task_loss": 0.5419033765792847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5803412199020386, + "epoch": 3.66, + "learning_rate": 3.16821639898563e-05, + "loss": 0.5058, + "step": 4334, + "task_loss": 1.0003029108047485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4588584899902344, + "epoch": 3.66, + "learning_rate": 3.1677937447168214e-05, + "loss": 0.7081, + "step": 4335, + "task_loss": 0.5820468068122864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39306706190109253, + "epoch": 3.66, + "learning_rate": 3.167371090448013e-05, + "loss": 0.4372, + "step": 4336, + "task_loss": 0.1411595493555069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3232420086860657, + "epoch": 3.67, + "learning_rate": 3.166948436179206e-05, + "loss": 0.6855, + "step": 4337, + "task_loss": 0.4553565979003906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9395290613174438, + "epoch": 3.67, + "learning_rate": 3.166525781910397e-05, + "loss": 0.7082, + "step": 4338, + "task_loss": 0.8924969434738159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45405763387680054, + "epoch": 3.67, + "learning_rate": 3.166103127641589e-05, + "loss": 0.389, + "step": 4339, + "task_loss": 0.06701885908842087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7120287418365479, + "epoch": 3.67, + "learning_rate": 3.165680473372781e-05, + "loss": 0.749, + "step": 4340, + "task_loss": 0.7082578539848328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4376968741416931, + "epoch": 3.67, + "learning_rate": 3.165257819103973e-05, + "loss": 0.7204, + "step": 4341, + "task_loss": 0.7110418677330017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3074084520339966, + "epoch": 3.67, + "learning_rate": 3.164835164835165e-05, + "loss": 0.5375, + "step": 4342, + "task_loss": 0.2828150987625122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7828773856163025, + "epoch": 3.67, + "learning_rate": 3.164412510566357e-05, + "loss": 0.6483, + "step": 4343, + "task_loss": 0.7823633551597595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5047447681427002, + "epoch": 3.67, + "learning_rate": 3.163989856297549e-05, + "loss": 0.5744, + "step": 4344, + "task_loss": 1.5857350826263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2383936047554016, + "epoch": 3.67, + "learning_rate": 3.1635672020287404e-05, + "loss": 0.5245, + "step": 4345, + "task_loss": 0.18443283438682556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.375379353761673, + "epoch": 3.67, + "learning_rate": 3.1631445477599324e-05, + "loss": 0.5501, + "step": 4346, + "task_loss": 0.6462752223014832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43102389574050903, + "epoch": 3.67, + "learning_rate": 3.1627218934911244e-05, + "loss": 0.4801, + "step": 4347, + "task_loss": 0.4378305673599243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6170026659965515, + "epoch": 3.67, + "learning_rate": 3.162299239222316e-05, + "loss": 0.7544, + "step": 4348, + "task_loss": 0.24095311760902405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9279892444610596, + "epoch": 3.68, + "learning_rate": 3.161876584953508e-05, + "loss": 0.6585, + "step": 4349, + "task_loss": 1.3238933086395264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9275033473968506, + "epoch": 3.68, + "learning_rate": 3.1614539306847e-05, + "loss": 0.6037, + "step": 4350, + "task_loss": 1.1025723218917847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7628381252288818, + "epoch": 3.68, + "learning_rate": 3.1610312764158916e-05, + "loss": 0.6579, + "step": 4351, + "task_loss": 0.606580913066864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9433069229125977, + "epoch": 3.68, + "learning_rate": 3.1606086221470836e-05, + "loss": 0.8221, + "step": 4352, + "task_loss": 0.8019368052482605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7382798790931702, + "epoch": 3.68, + "learning_rate": 3.1601859678782755e-05, + "loss": 0.8307, + "step": 4353, + "task_loss": 1.1192035675048828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.545191764831543, + "epoch": 3.68, + "learning_rate": 3.159763313609468e-05, + "loss": 0.8978, + "step": 4354, + "task_loss": 1.6121832132339478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6115432977676392, + "epoch": 3.68, + "learning_rate": 3.1593406593406595e-05, + "loss": 0.7274, + "step": 4355, + "task_loss": 1.1889289617538452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40386080741882324, + "epoch": 3.68, + "learning_rate": 3.1589180050718514e-05, + "loss": 0.597, + "step": 4356, + "task_loss": 1.165809154510498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5622716546058655, + "epoch": 3.68, + "learning_rate": 3.1584953508030434e-05, + "loss": 0.5775, + "step": 4357, + "task_loss": 0.801180362701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.625637412071228, + "epoch": 3.68, + "learning_rate": 3.158072696534235e-05, + "loss": 0.5286, + "step": 4358, + "task_loss": 0.6454900503158569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3796696066856384, + "epoch": 3.68, + "learning_rate": 3.1576500422654274e-05, + "loss": 0.5803, + "step": 4359, + "task_loss": 0.852182924747467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.580944299697876, + "epoch": 3.69, + "learning_rate": 3.1572273879966193e-05, + "loss": 0.5197, + "step": 4360, + "task_loss": 0.41458451747894287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48113706707954407, + "epoch": 3.69, + "learning_rate": 3.1568047337278106e-05, + "loss": 0.6058, + "step": 4361, + "task_loss": 0.5011580586433411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4710251986980438, + "epoch": 3.69, + "learning_rate": 3.1563820794590026e-05, + "loss": 0.652, + "step": 4362, + "task_loss": 0.9010404348373413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6202372312545776, + "epoch": 3.69, + "learning_rate": 3.1559594251901946e-05, + "loss": 0.5504, + "step": 4363, + "task_loss": 0.9609682559967041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.86912602186203, + "epoch": 3.69, + "learning_rate": 3.155536770921386e-05, + "loss": 0.5964, + "step": 4364, + "task_loss": 0.5776647925376892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.114647626876831, + "epoch": 3.69, + "learning_rate": 3.1551141166525785e-05, + "loss": 0.7971, + "step": 4365, + "task_loss": 0.8565689921379089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49282217025756836, + "epoch": 3.69, + "learning_rate": 3.1546914623837705e-05, + "loss": 0.6618, + "step": 4366, + "task_loss": 1.132073163986206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0035711526870728, + "epoch": 3.69, + "learning_rate": 3.154268808114962e-05, + "loss": 0.8887, + "step": 4367, + "task_loss": 1.2741198539733887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6079678535461426, + "epoch": 3.69, + "learning_rate": 3.153846153846154e-05, + "loss": 0.5419, + "step": 4368, + "task_loss": 0.9336712956428528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.450817346572876, + "epoch": 3.69, + "learning_rate": 3.153423499577346e-05, + "loss": 0.4887, + "step": 4369, + "task_loss": 0.6696841716766357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40493929386138916, + "epoch": 3.69, + "learning_rate": 3.153000845308538e-05, + "loss": 0.6124, + "step": 4370, + "task_loss": 0.5956547260284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7713926434516907, + "epoch": 3.69, + "learning_rate": 3.15257819103973e-05, + "loss": 0.6891, + "step": 4371, + "task_loss": 0.187714621424675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4834434390068054, + "epoch": 3.7, + "learning_rate": 3.152155536770922e-05, + "loss": 0.4901, + "step": 4372, + "task_loss": 0.9323393702507019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1184810400009155, + "epoch": 3.7, + "learning_rate": 3.1517328825021136e-05, + "loss": 0.607, + "step": 4373, + "task_loss": 0.9886298179626465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4940279424190521, + "epoch": 3.7, + "learning_rate": 3.151310228233305e-05, + "loss": 0.5571, + "step": 4374, + "task_loss": 0.1513417661190033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5817437171936035, + "epoch": 3.7, + "learning_rate": 3.150887573964497e-05, + "loss": 0.6751, + "step": 4375, + "task_loss": 1.9388114213943481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3639531135559082, + "epoch": 3.7, + "learning_rate": 3.1504649196956896e-05, + "loss": 0.6742, + "step": 4376, + "task_loss": 0.45520275831222534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40285754203796387, + "epoch": 3.7, + "learning_rate": 3.150042265426881e-05, + "loss": 0.5499, + "step": 4377, + "task_loss": 0.9072644114494324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3399021029472351, + "epoch": 3.7, + "learning_rate": 3.149619611158073e-05, + "loss": 0.7381, + "step": 4378, + "task_loss": 0.817537248134613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.465384304523468, + "epoch": 3.7, + "learning_rate": 3.149196956889265e-05, + "loss": 0.5999, + "step": 4379, + "task_loss": 0.26480424404144287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.560734212398529, + "epoch": 3.7, + "learning_rate": 3.148774302620456e-05, + "loss": 0.7969, + "step": 4380, + "task_loss": 0.5340701937675476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3710169196128845, + "epoch": 3.7, + "learning_rate": 3.148351648351648e-05, + "loss": 0.6207, + "step": 4381, + "task_loss": 0.07768706232309341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3791731595993042, + "epoch": 3.7, + "learning_rate": 3.147928994082841e-05, + "loss": 0.6673, + "step": 4382, + "task_loss": 0.15831103920936584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7198534607887268, + "epoch": 3.7, + "learning_rate": 3.147506339814033e-05, + "loss": 0.7723, + "step": 4383, + "task_loss": 0.12946297228336334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8525928258895874, + "epoch": 3.71, + "learning_rate": 3.147083685545224e-05, + "loss": 0.6921, + "step": 4384, + "task_loss": 0.4998167157173157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8171555995941162, + "epoch": 3.71, + "learning_rate": 3.146661031276416e-05, + "loss": 0.6871, + "step": 4385, + "task_loss": 1.2481720447540283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7714055180549622, + "epoch": 3.71, + "learning_rate": 3.146238377007608e-05, + "loss": 0.5675, + "step": 4386, + "task_loss": 0.1793496161699295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5537706613540649, + "epoch": 3.71, + "learning_rate": 3.1458157227388e-05, + "loss": 0.629, + "step": 4387, + "task_loss": 1.3294854164123535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6943092346191406, + "epoch": 3.71, + "learning_rate": 3.145393068469992e-05, + "loss": 0.6324, + "step": 4388, + "task_loss": 0.33173462748527527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7647502422332764, + "epoch": 3.71, + "learning_rate": 3.144970414201184e-05, + "loss": 0.6857, + "step": 4389, + "task_loss": 1.4498182535171509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28095296025276184, + "epoch": 3.71, + "learning_rate": 3.144547759932375e-05, + "loss": 0.4888, + "step": 4390, + "task_loss": 0.1757201850414276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.285271555185318, + "epoch": 3.71, + "learning_rate": 3.144125105663567e-05, + "loss": 0.6081, + "step": 4391, + "task_loss": 1.0653576850891113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5226113796234131, + "epoch": 3.71, + "learning_rate": 3.143702451394759e-05, + "loss": 0.6865, + "step": 4392, + "task_loss": 0.9720564484596252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6892796158790588, + "epoch": 3.71, + "learning_rate": 3.143279797125951e-05, + "loss": 0.7721, + "step": 4393, + "task_loss": 0.23807276785373688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6205359101295471, + "epoch": 3.71, + "learning_rate": 3.142857142857143e-05, + "loss": 0.6149, + "step": 4394, + "task_loss": 1.3027851581573486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4892859160900116, + "epoch": 3.71, + "learning_rate": 3.142434488588335e-05, + "loss": 0.6674, + "step": 4395, + "task_loss": 0.780527651309967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28657805919647217, + "epoch": 3.72, + "learning_rate": 3.142011834319526e-05, + "loss": 0.562, + "step": 4396, + "task_loss": 0.3991193473339081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7082350254058838, + "epoch": 3.72, + "learning_rate": 3.141589180050718e-05, + "loss": 0.5793, + "step": 4397, + "task_loss": 0.8086472153663635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9290770292282104, + "epoch": 3.72, + "learning_rate": 3.14116652578191e-05, + "loss": 0.6864, + "step": 4398, + "task_loss": 0.8588287234306335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0052218437194824, + "epoch": 3.72, + "learning_rate": 3.140743871513103e-05, + "loss": 0.7908, + "step": 4399, + "task_loss": 1.4899024963378906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5374901294708252, + "epoch": 3.72, + "learning_rate": 3.140321217244294e-05, + "loss": 0.5116, + "step": 4400, + "task_loss": 0.24133923649787903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5099197030067444, + "epoch": 3.72, + "learning_rate": 3.139898562975486e-05, + "loss": 0.6579, + "step": 4401, + "task_loss": 1.5236730575561523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3611644208431244, + "epoch": 3.72, + "learning_rate": 3.139475908706678e-05, + "loss": 0.6015, + "step": 4402, + "task_loss": 0.6340007185935974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40776538848876953, + "epoch": 3.72, + "learning_rate": 3.1390532544378695e-05, + "loss": 0.5825, + "step": 4403, + "task_loss": 0.5617125034332275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5205349922180176, + "epoch": 3.72, + "learning_rate": 3.138630600169062e-05, + "loss": 0.6563, + "step": 4404, + "task_loss": 0.34392574429512024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5175126791000366, + "epoch": 3.72, + "learning_rate": 3.138207945900254e-05, + "loss": 0.5221, + "step": 4405, + "task_loss": 0.28904277086257935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.067955732345581, + "epoch": 3.72, + "learning_rate": 3.1377852916314454e-05, + "loss": 0.8143, + "step": 4406, + "task_loss": 1.4027005434036255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2872017025947571, + "epoch": 3.72, + "learning_rate": 3.1373626373626374e-05, + "loss": 0.6213, + "step": 4407, + "task_loss": 0.19852782785892487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5442214608192444, + "epoch": 3.73, + "learning_rate": 3.1369399830938293e-05, + "loss": 0.6758, + "step": 4408, + "task_loss": 0.3480748236179352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5705583095550537, + "epoch": 3.73, + "learning_rate": 3.136517328825021e-05, + "loss": 0.4024, + "step": 4409, + "task_loss": 1.0452461242675781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3976019322872162, + "epoch": 3.73, + "learning_rate": 3.136094674556213e-05, + "loss": 0.8186, + "step": 4410, + "task_loss": 0.7618409395217896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6189587116241455, + "epoch": 3.73, + "learning_rate": 3.135672020287405e-05, + "loss": 0.8592, + "step": 4411, + "task_loss": 0.8902673125267029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8908120393753052, + "epoch": 3.73, + "learning_rate": 3.135249366018597e-05, + "loss": 0.6737, + "step": 4412, + "task_loss": 0.8862276673316956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5203185081481934, + "epoch": 3.73, + "learning_rate": 3.1348267117497885e-05, + "loss": 0.4746, + "step": 4413, + "task_loss": 1.1344376802444458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6784696578979492, + "epoch": 3.73, + "learning_rate": 3.1344040574809805e-05, + "loss": 0.6913, + "step": 4414, + "task_loss": 1.3508683443069458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7924786806106567, + "epoch": 3.73, + "learning_rate": 3.1339814032121725e-05, + "loss": 0.607, + "step": 4415, + "task_loss": 0.5525851249694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1014690399169922, + "epoch": 3.73, + "learning_rate": 3.1335587489433645e-05, + "loss": 0.6748, + "step": 4416, + "task_loss": 0.7910175919532776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6909679174423218, + "epoch": 3.73, + "learning_rate": 3.1331360946745564e-05, + "loss": 0.6901, + "step": 4417, + "task_loss": 1.2508430480957031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3092275559902191, + "epoch": 3.73, + "learning_rate": 3.1327134404057484e-05, + "loss": 0.5479, + "step": 4418, + "task_loss": 0.1661626249551773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44045236706733704, + "epoch": 3.73, + "learning_rate": 3.13229078613694e-05, + "loss": 0.4972, + "step": 4419, + "task_loss": 0.5092931389808655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6696663498878479, + "epoch": 3.74, + "learning_rate": 3.131868131868132e-05, + "loss": 0.5663, + "step": 4420, + "task_loss": 0.7401490211486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8323312401771545, + "epoch": 3.74, + "learning_rate": 3.131445477599324e-05, + "loss": 0.5374, + "step": 4421, + "task_loss": 1.1042890548706055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6723310947418213, + "epoch": 3.74, + "learning_rate": 3.1310228233305156e-05, + "loss": 0.63, + "step": 4422, + "task_loss": 1.2064276933670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4610983729362488, + "epoch": 3.74, + "learning_rate": 3.1306001690617076e-05, + "loss": 0.5279, + "step": 4423, + "task_loss": 1.6228435039520264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4889376759529114, + "epoch": 3.74, + "learning_rate": 3.1301775147928996e-05, + "loss": 0.5157, + "step": 4424, + "task_loss": 0.6824313998222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7890335917472839, + "epoch": 3.74, + "learning_rate": 3.129754860524091e-05, + "loss": 0.6034, + "step": 4425, + "task_loss": 1.9725552797317505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4488603174686432, + "epoch": 3.74, + "learning_rate": 3.1293322062552835e-05, + "loss": 0.5131, + "step": 4426, + "task_loss": 0.3517454266548157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5910731554031372, + "epoch": 3.74, + "learning_rate": 3.1289095519864755e-05, + "loss": 0.4865, + "step": 4427, + "task_loss": 0.8392993211746216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6440855264663696, + "epoch": 3.74, + "learning_rate": 3.1284868977176675e-05, + "loss": 0.7271, + "step": 4428, + "task_loss": 0.8623766303062439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6308737993240356, + "epoch": 3.74, + "learning_rate": 3.128064243448859e-05, + "loss": 0.5544, + "step": 4429, + "task_loss": 0.3763124942779541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8419972062110901, + "epoch": 3.74, + "learning_rate": 3.127641589180051e-05, + "loss": 0.8696, + "step": 4430, + "task_loss": 1.134678840637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0648462772369385, + "epoch": 3.75, + "learning_rate": 3.127218934911243e-05, + "loss": 0.7891, + "step": 4431, + "task_loss": 1.5075888633728027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4411104619503021, + "epoch": 3.75, + "learning_rate": 3.126796280642435e-05, + "loss": 0.5469, + "step": 4432, + "task_loss": 0.7082804441452026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8028160333633423, + "epoch": 3.75, + "learning_rate": 3.1263736263736267e-05, + "loss": 0.7598, + "step": 4433, + "task_loss": 0.6093019843101501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5423300862312317, + "epoch": 3.75, + "learning_rate": 3.1259509721048186e-05, + "loss": 0.564, + "step": 4434, + "task_loss": 0.3890649080276489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7079875469207764, + "epoch": 3.75, + "learning_rate": 3.12552831783601e-05, + "loss": 0.6192, + "step": 4435, + "task_loss": 0.4845954179763794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5152788162231445, + "epoch": 3.75, + "learning_rate": 3.125105663567202e-05, + "loss": 0.6525, + "step": 4436, + "task_loss": 0.7137448191642761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38127589225769043, + "epoch": 3.75, + "learning_rate": 3.124683009298394e-05, + "loss": 0.5073, + "step": 4437, + "task_loss": 0.3960942029953003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.919884204864502, + "epoch": 3.75, + "learning_rate": 3.124260355029586e-05, + "loss": 0.5951, + "step": 4438, + "task_loss": 0.6805700659751892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36860549449920654, + "epoch": 3.75, + "learning_rate": 3.123837700760778e-05, + "loss": 0.5476, + "step": 4439, + "task_loss": 0.31832200288772583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7535417675971985, + "epoch": 3.75, + "learning_rate": 3.12341504649197e-05, + "loss": 0.6445, + "step": 4440, + "task_loss": 1.4454289674758911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8517183065414429, + "epoch": 3.75, + "learning_rate": 3.122992392223162e-05, + "loss": 0.9319, + "step": 4441, + "task_loss": 0.8230589628219604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5334685444831848, + "epoch": 3.75, + "learning_rate": 3.122569737954353e-05, + "loss": 0.6219, + "step": 4442, + "task_loss": 0.42743802070617676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7406021952629089, + "epoch": 3.76, + "learning_rate": 3.122147083685546e-05, + "loss": 0.6814, + "step": 4443, + "task_loss": 0.5760184526443481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4810228943824768, + "epoch": 3.76, + "learning_rate": 3.121724429416738e-05, + "loss": 0.7459, + "step": 4444, + "task_loss": 1.1669907569885254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9073999524116516, + "epoch": 3.76, + "learning_rate": 3.121301775147929e-05, + "loss": 0.5434, + "step": 4445, + "task_loss": 0.8882380723953247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4339657425880432, + "epoch": 3.76, + "learning_rate": 3.120879120879121e-05, + "loss": 0.6492, + "step": 4446, + "task_loss": 0.2799747586250305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.941970705986023, + "epoch": 3.76, + "learning_rate": 3.120456466610313e-05, + "loss": 0.7707, + "step": 4447, + "task_loss": 0.6933077573776245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7601891756057739, + "epoch": 3.76, + "learning_rate": 3.120033812341505e-05, + "loss": 0.678, + "step": 4448, + "task_loss": 1.7052204608917236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.188031405210495, + "epoch": 3.76, + "learning_rate": 3.119611158072697e-05, + "loss": 0.4625, + "step": 4449, + "task_loss": 0.014701290987432003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7593672275543213, + "epoch": 3.76, + "learning_rate": 3.119188503803889e-05, + "loss": 0.657, + "step": 4450, + "task_loss": 1.521148920059204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6656076908111572, + "epoch": 3.76, + "learning_rate": 3.11876584953508e-05, + "loss": 0.6235, + "step": 4451, + "task_loss": 0.9446977376937866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1436831951141357, + "epoch": 3.76, + "learning_rate": 3.118343195266272e-05, + "loss": 0.6096, + "step": 4452, + "task_loss": 0.6983094811439514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9589691162109375, + "epoch": 3.76, + "learning_rate": 3.117920540997464e-05, + "loss": 0.5742, + "step": 4453, + "task_loss": 0.5564941167831421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4245166480541229, + "epoch": 3.76, + "learning_rate": 3.117497886728656e-05, + "loss": 0.3938, + "step": 4454, + "task_loss": 0.5738080739974976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7556803226470947, + "epoch": 3.77, + "learning_rate": 3.117075232459848e-05, + "loss": 0.659, + "step": 4455, + "task_loss": 0.25264406204223633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.588709831237793, + "epoch": 3.77, + "learning_rate": 3.11665257819104e-05, + "loss": 0.6914, + "step": 4456, + "task_loss": 0.8855775594711304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6776687502861023, + "epoch": 3.77, + "learning_rate": 3.116229923922232e-05, + "loss": 0.4626, + "step": 4457, + "task_loss": 0.5914424657821655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48696398735046387, + "epoch": 3.77, + "learning_rate": 3.115807269653423e-05, + "loss": 0.7374, + "step": 4458, + "task_loss": 0.5321323871612549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2600225806236267, + "epoch": 3.77, + "learning_rate": 3.115384615384615e-05, + "loss": 0.484, + "step": 4459, + "task_loss": 0.46346914768218994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4186503291130066, + "epoch": 3.77, + "learning_rate": 3.114961961115808e-05, + "loss": 0.4702, + "step": 4460, + "task_loss": 0.8028692007064819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4472455084323883, + "epoch": 3.77, + "learning_rate": 3.114539306846999e-05, + "loss": 0.5175, + "step": 4461, + "task_loss": 0.5751044154167175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3327191472053528, + "epoch": 3.77, + "learning_rate": 3.114116652578191e-05, + "loss": 0.5316, + "step": 4462, + "task_loss": 0.2516881227493286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9260343313217163, + "epoch": 3.77, + "learning_rate": 3.113693998309383e-05, + "loss": 0.676, + "step": 4463, + "task_loss": 0.5999809503555298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6782312989234924, + "epoch": 3.77, + "learning_rate": 3.1132713440405745e-05, + "loss": 0.752, + "step": 4464, + "task_loss": 1.2817986011505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6236807703971863, + "epoch": 3.77, + "learning_rate": 3.112848689771767e-05, + "loss": 0.7073, + "step": 4465, + "task_loss": 0.9672214984893799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5089489221572876, + "epoch": 3.77, + "learning_rate": 3.112426035502959e-05, + "loss": 0.5038, + "step": 4466, + "task_loss": 0.08270702511072159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.432626873254776, + "epoch": 3.78, + "learning_rate": 3.1120033812341504e-05, + "loss": 0.5698, + "step": 4467, + "task_loss": 0.45906898379325867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4977017939090729, + "epoch": 3.78, + "learning_rate": 3.1115807269653424e-05, + "loss": 0.5934, + "step": 4468, + "task_loss": 0.29010775685310364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1058757305145264, + "epoch": 3.78, + "learning_rate": 3.111158072696534e-05, + "loss": 0.7239, + "step": 4469, + "task_loss": 2.229424476623535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5502418279647827, + "epoch": 3.78, + "learning_rate": 3.110735418427726e-05, + "loss": 0.5335, + "step": 4470, + "task_loss": 0.08442430943250656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34887951612472534, + "epoch": 3.78, + "learning_rate": 3.110312764158918e-05, + "loss": 0.5992, + "step": 4471, + "task_loss": 0.4855583608150482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6222760081291199, + "epoch": 3.78, + "learning_rate": 3.10989010989011e-05, + "loss": 0.6416, + "step": 4472, + "task_loss": 0.48976996541023254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34236112236976624, + "epoch": 3.78, + "learning_rate": 3.109467455621302e-05, + "loss": 0.4734, + "step": 4473, + "task_loss": 0.497106671333313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47770261764526367, + "epoch": 3.78, + "learning_rate": 3.1090448013524935e-05, + "loss": 0.4986, + "step": 4474, + "task_loss": 0.6153969168663025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.685827374458313, + "epoch": 3.78, + "learning_rate": 3.1086221470836855e-05, + "loss": 0.5499, + "step": 4475, + "task_loss": 0.11037730425596237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29985013604164124, + "epoch": 3.78, + "learning_rate": 3.1081994928148775e-05, + "loss": 0.6297, + "step": 4476, + "task_loss": 0.8097463846206665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7170051336288452, + "epoch": 3.78, + "learning_rate": 3.1077768385460694e-05, + "loss": 0.6078, + "step": 4477, + "task_loss": 0.8717119693756104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7694337368011475, + "epoch": 3.78, + "learning_rate": 3.1073541842772614e-05, + "loss": 0.6077, + "step": 4478, + "task_loss": 1.08436918258667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3849923610687256, + "epoch": 3.79, + "learning_rate": 3.1069315300084534e-05, + "loss": 0.4593, + "step": 4479, + "task_loss": 0.4722740650177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5677958726882935, + "epoch": 3.79, + "learning_rate": 3.106508875739645e-05, + "loss": 0.5593, + "step": 4480, + "task_loss": 0.15746158361434937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7945002317428589, + "epoch": 3.79, + "learning_rate": 3.1060862214708367e-05, + "loss": 0.608, + "step": 4481, + "task_loss": 0.7113239169120789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5034031867980957, + "epoch": 3.79, + "learning_rate": 3.105663567202029e-05, + "loss": 0.5462, + "step": 4482, + "task_loss": 1.173659086227417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36249810457229614, + "epoch": 3.79, + "learning_rate": 3.1052409129332206e-05, + "loss": 0.4536, + "step": 4483, + "task_loss": 0.055759161710739136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5031832456588745, + "epoch": 3.79, + "learning_rate": 3.1048182586644126e-05, + "loss": 0.5148, + "step": 4484, + "task_loss": 0.9738038778305054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5841338634490967, + "epoch": 3.79, + "learning_rate": 3.1043956043956046e-05, + "loss": 0.6619, + "step": 4485, + "task_loss": 0.8691191673278809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7698627710342407, + "epoch": 3.79, + "learning_rate": 3.1039729501267965e-05, + "loss": 0.5726, + "step": 4486, + "task_loss": 1.609784483909607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7682676315307617, + "epoch": 3.79, + "learning_rate": 3.1035502958579885e-05, + "loss": 0.5144, + "step": 4487, + "task_loss": 1.391501545906067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7411738038063049, + "epoch": 3.79, + "learning_rate": 3.1031276415891805e-05, + "loss": 0.6006, + "step": 4488, + "task_loss": 1.196273684501648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4277286231517792, + "epoch": 3.79, + "learning_rate": 3.1027049873203725e-05, + "loss": 0.5148, + "step": 4489, + "task_loss": 0.6398351788520813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4339134097099304, + "epoch": 3.79, + "learning_rate": 3.102282333051564e-05, + "loss": 0.5867, + "step": 4490, + "task_loss": 0.987057626247406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39959561824798584, + "epoch": 3.8, + "learning_rate": 3.101859678782756e-05, + "loss": 0.4816, + "step": 4491, + "task_loss": 0.6280518770217896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7162389755249023, + "epoch": 3.8, + "learning_rate": 3.101437024513948e-05, + "loss": 0.7001, + "step": 4492, + "task_loss": 1.1401653289794922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7851250767707825, + "epoch": 3.8, + "learning_rate": 3.10101437024514e-05, + "loss": 0.6459, + "step": 4493, + "task_loss": 1.1911137104034424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.635193407535553, + "epoch": 3.8, + "learning_rate": 3.1005917159763316e-05, + "loss": 0.6207, + "step": 4494, + "task_loss": 0.6690196394920349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8584632873535156, + "epoch": 3.8, + "learning_rate": 3.1001690617075236e-05, + "loss": 0.5074, + "step": 4495, + "task_loss": 0.7543220520019531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8067877292633057, + "epoch": 3.8, + "learning_rate": 3.099746407438715e-05, + "loss": 0.6872, + "step": 4496, + "task_loss": 0.7726377248764038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5504999160766602, + "epoch": 3.8, + "learning_rate": 3.099323753169907e-05, + "loss": 0.5616, + "step": 4497, + "task_loss": 0.6769705414772034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.522042453289032, + "epoch": 3.8, + "learning_rate": 3.098901098901099e-05, + "loss": 0.587, + "step": 4498, + "task_loss": 0.2908691465854645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43705064058303833, + "epoch": 3.8, + "learning_rate": 3.0984784446322915e-05, + "loss": 0.6191, + "step": 4499, + "task_loss": 0.2567892074584961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5280297994613647, + "epoch": 3.8, + "learning_rate": 3.098055790363483e-05, + "loss": 0.6967, + "step": 4500, + "task_loss": 1.2133866548538208 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.8985742574257426, + "eval_loss": 0.38136711716651917, + "eval_runtime": 227.6852, + "eval_samples_per_second": 110.899, + "eval_steps_per_second": 0.87, + "step": 4500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8794289827346802, + "epoch": 3.8, + "learning_rate": 3.097633136094675e-05, + "loss": 0.7024, + "step": 4501, + "task_loss": 0.5016676783561707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5788029432296753, + "epoch": 3.81, + "learning_rate": 3.097210481825867e-05, + "loss": 0.5417, + "step": 4502, + "task_loss": 0.4744543433189392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42210519313812256, + "epoch": 3.81, + "learning_rate": 3.096787827557058e-05, + "loss": 0.4941, + "step": 4503, + "task_loss": 0.48587310314178467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4772506058216095, + "epoch": 3.81, + "learning_rate": 3.096365173288251e-05, + "loss": 0.5663, + "step": 4504, + "task_loss": 0.3254431486129761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44926780462265015, + "epoch": 3.81, + "learning_rate": 3.095942519019443e-05, + "loss": 0.6775, + "step": 4505, + "task_loss": 0.721937358379364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6382407546043396, + "epoch": 3.81, + "learning_rate": 3.095519864750634e-05, + "loss": 0.5624, + "step": 4506, + "task_loss": 1.164475679397583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7414417862892151, + "epoch": 3.81, + "learning_rate": 3.095097210481826e-05, + "loss": 0.5192, + "step": 4507, + "task_loss": 1.4476611614227295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3845599889755249, + "epoch": 3.81, + "learning_rate": 3.094674556213018e-05, + "loss": 0.5151, + "step": 4508, + "task_loss": 0.7954856157302856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1481575965881348, + "epoch": 3.81, + "learning_rate": 3.094251901944209e-05, + "loss": 0.6428, + "step": 4509, + "task_loss": 0.6410477161407471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9641256332397461, + "epoch": 3.81, + "learning_rate": 3.093829247675402e-05, + "loss": 0.6298, + "step": 4510, + "task_loss": 1.6068434715270996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0639525651931763, + "epoch": 3.81, + "learning_rate": 3.093406593406594e-05, + "loss": 0.7856, + "step": 4511, + "task_loss": 0.5756399631500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8429574966430664, + "epoch": 3.81, + "learning_rate": 3.092983939137785e-05, + "loss": 0.5936, + "step": 4512, + "task_loss": 0.6623560190200806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34575480222702026, + "epoch": 3.81, + "learning_rate": 3.092561284868977e-05, + "loss": 0.5662, + "step": 4513, + "task_loss": 0.52729332447052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5963305234909058, + "epoch": 3.82, + "learning_rate": 3.092138630600169e-05, + "loss": 0.6183, + "step": 4514, + "task_loss": 0.3971259593963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.475811630487442, + "epoch": 3.82, + "learning_rate": 3.091715976331361e-05, + "loss": 0.5257, + "step": 4515, + "task_loss": 0.9267137050628662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4244786202907562, + "epoch": 3.82, + "learning_rate": 3.091293322062553e-05, + "loss": 0.4573, + "step": 4516, + "task_loss": 0.19998353719711304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5385617017745972, + "epoch": 3.82, + "learning_rate": 3.090870667793745e-05, + "loss": 0.6072, + "step": 4517, + "task_loss": 1.0044212341308594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3737107217311859, + "epoch": 3.82, + "learning_rate": 3.090448013524937e-05, + "loss": 0.5639, + "step": 4518, + "task_loss": 0.7181384563446045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4733601212501526, + "epoch": 3.82, + "learning_rate": 3.090025359256128e-05, + "loss": 0.5285, + "step": 4519, + "task_loss": 0.630943775177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38003605604171753, + "epoch": 3.82, + "learning_rate": 3.08960270498732e-05, + "loss": 0.3895, + "step": 4520, + "task_loss": 0.33974841237068176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3528686761856079, + "epoch": 3.82, + "learning_rate": 3.089180050718513e-05, + "loss": 0.5676, + "step": 4521, + "task_loss": 1.288833498954773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0590084791183472, + "epoch": 3.82, + "learning_rate": 3.088757396449704e-05, + "loss": 0.6612, + "step": 4522, + "task_loss": 0.3500828146934509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8006274700164795, + "epoch": 3.82, + "learning_rate": 3.088334742180896e-05, + "loss": 0.8013, + "step": 4523, + "task_loss": 0.4747190773487091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6032232046127319, + "epoch": 3.82, + "learning_rate": 3.087912087912088e-05, + "loss": 0.5985, + "step": 4524, + "task_loss": 0.9979947805404663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7694937586784363, + "epoch": 3.82, + "learning_rate": 3.0874894336432794e-05, + "loss": 0.6899, + "step": 4525, + "task_loss": 0.5824013352394104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6039519906044006, + "epoch": 3.83, + "learning_rate": 3.0870667793744714e-05, + "loss": 0.7181, + "step": 4526, + "task_loss": 0.6761059761047363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46193987131118774, + "epoch": 3.83, + "learning_rate": 3.086644125105664e-05, + "loss": 0.6042, + "step": 4527, + "task_loss": 0.5998802781105042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8607094883918762, + "epoch": 3.83, + "learning_rate": 3.086221470836856e-05, + "loss": 0.7503, + "step": 4528, + "task_loss": 1.418262004852295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6125825047492981, + "epoch": 3.83, + "learning_rate": 3.0857988165680473e-05, + "loss": 0.5619, + "step": 4529, + "task_loss": 0.31533950567245483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6773574352264404, + "epoch": 3.83, + "learning_rate": 3.085376162299239e-05, + "loss": 0.7213, + "step": 4530, + "task_loss": 0.7295495867729187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6777825355529785, + "epoch": 3.83, + "learning_rate": 3.084953508030431e-05, + "loss": 0.8211, + "step": 4531, + "task_loss": 1.4275426864624023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44163185358047485, + "epoch": 3.83, + "learning_rate": 3.084530853761623e-05, + "loss": 0.5865, + "step": 4532, + "task_loss": 0.28577885031700134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8128281831741333, + "epoch": 3.83, + "learning_rate": 3.084108199492815e-05, + "loss": 0.609, + "step": 4533, + "task_loss": 0.7846980690956116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5423414707183838, + "epoch": 3.83, + "learning_rate": 3.083685545224007e-05, + "loss": 0.517, + "step": 4534, + "task_loss": 0.36103442311286926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7944339513778687, + "epoch": 3.83, + "learning_rate": 3.0832628909551985e-05, + "loss": 0.7502, + "step": 4535, + "task_loss": 2.01533579826355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5221424102783203, + "epoch": 3.83, + "learning_rate": 3.0828402366863905e-05, + "loss": 0.4301, + "step": 4536, + "task_loss": 0.3656233549118042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5722428560256958, + "epoch": 3.83, + "learning_rate": 3.0824175824175825e-05, + "loss": 0.5431, + "step": 4537, + "task_loss": 0.870442271232605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6565269827842712, + "epoch": 3.84, + "learning_rate": 3.0819949281487744e-05, + "loss": 0.6714, + "step": 4538, + "task_loss": 0.4387611150741577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8450748920440674, + "epoch": 3.84, + "learning_rate": 3.0815722738799664e-05, + "loss": 0.6987, + "step": 4539, + "task_loss": 1.4304630756378174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5021947622299194, + "epoch": 3.84, + "learning_rate": 3.0811496196111584e-05, + "loss": 0.6695, + "step": 4540, + "task_loss": 0.7394357919692993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4328250586986542, + "epoch": 3.84, + "learning_rate": 3.08072696534235e-05, + "loss": 0.4396, + "step": 4541, + "task_loss": 0.6554301977157593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43889665603637695, + "epoch": 3.84, + "learning_rate": 3.0803043110735416e-05, + "loss": 0.5739, + "step": 4542, + "task_loss": 0.18864817917346954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6976079940795898, + "epoch": 3.84, + "learning_rate": 3.0798816568047336e-05, + "loss": 0.5165, + "step": 4543, + "task_loss": 1.2544262409210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35558581352233887, + "epoch": 3.84, + "learning_rate": 3.079459002535926e-05, + "loss": 0.5898, + "step": 4544, + "task_loss": 0.49586665630340576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5743883848190308, + "epoch": 3.84, + "learning_rate": 3.0790363482671176e-05, + "loss": 0.7453, + "step": 4545, + "task_loss": 0.12546835839748383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6252610683441162, + "epoch": 3.84, + "learning_rate": 3.0786136939983095e-05, + "loss": 0.5556, + "step": 4546, + "task_loss": 1.0304925441741943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.60959792137146, + "epoch": 3.84, + "learning_rate": 3.0781910397295015e-05, + "loss": 0.5557, + "step": 4547, + "task_loss": 0.8270279169082642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6361767649650574, + "epoch": 3.84, + "learning_rate": 3.077768385460693e-05, + "loss": 0.5612, + "step": 4548, + "task_loss": 1.0571876764297485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4474867582321167, + "epoch": 3.84, + "learning_rate": 3.0773457311918855e-05, + "loss": 0.6831, + "step": 4549, + "task_loss": 1.0027767419815063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7085468769073486, + "epoch": 3.85, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.8437, + "step": 4550, + "task_loss": 0.6151509284973145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3118171691894531, + "epoch": 3.85, + "learning_rate": 3.076500422654269e-05, + "loss": 0.5076, + "step": 4551, + "task_loss": 0.4880085289478302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42318493127822876, + "epoch": 3.85, + "learning_rate": 3.076077768385461e-05, + "loss": 0.5083, + "step": 4552, + "task_loss": 0.7853288054466248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4406645596027374, + "epoch": 3.85, + "learning_rate": 3.075655114116653e-05, + "loss": 0.6061, + "step": 4553, + "task_loss": 1.2846165895462036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46522390842437744, + "epoch": 3.85, + "learning_rate": 3.0752324598478447e-05, + "loss": 0.5885, + "step": 4554, + "task_loss": 0.6114926338195801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6775391697883606, + "epoch": 3.85, + "learning_rate": 3.0748098055790366e-05, + "loss": 0.7544, + "step": 4555, + "task_loss": 1.5634747743606567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3962712287902832, + "epoch": 3.85, + "learning_rate": 3.0743871513102286e-05, + "loss": 0.4072, + "step": 4556, + "task_loss": 0.27858734130859375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6714860200881958, + "epoch": 3.85, + "learning_rate": 3.0739644970414206e-05, + "loss": 0.4693, + "step": 4557, + "task_loss": 0.5121512413024902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3502795696258545, + "epoch": 3.85, + "learning_rate": 3.073541842772612e-05, + "loss": 0.5174, + "step": 4558, + "task_loss": 0.18655872344970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45405086874961853, + "epoch": 3.85, + "learning_rate": 3.073119188503804e-05, + "loss": 0.5134, + "step": 4559, + "task_loss": 0.9049712419509888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6238760352134705, + "epoch": 3.85, + "learning_rate": 3.072696534234996e-05, + "loss": 0.6217, + "step": 4560, + "task_loss": 1.1351268291473389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4105741083621979, + "epoch": 3.85, + "learning_rate": 3.072273879966188e-05, + "loss": 0.4496, + "step": 4561, + "task_loss": 0.3665374517440796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8665479421615601, + "epoch": 3.86, + "learning_rate": 3.07185122569738e-05, + "loss": 0.5432, + "step": 4562, + "task_loss": 0.38219016790390015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46830126643180847, + "epoch": 3.86, + "learning_rate": 3.071428571428572e-05, + "loss": 0.6991, + "step": 4563, + "task_loss": 0.45985695719718933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6550844311714172, + "epoch": 3.86, + "learning_rate": 3.071005917159763e-05, + "loss": 0.4581, + "step": 4564, + "task_loss": 0.3388121426105499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21673999726772308, + "epoch": 3.86, + "learning_rate": 3.070583262890955e-05, + "loss": 0.4623, + "step": 4565, + "task_loss": 0.05698661878705025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6762908697128296, + "epoch": 3.86, + "learning_rate": 3.070160608622148e-05, + "loss": 0.5875, + "step": 4566, + "task_loss": 0.6332026124000549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7190256118774414, + "epoch": 3.86, + "learning_rate": 3.069737954353339e-05, + "loss": 0.6256, + "step": 4567, + "task_loss": 0.641409158706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5931517481803894, + "epoch": 3.86, + "learning_rate": 3.069315300084531e-05, + "loss": 0.6252, + "step": 4568, + "task_loss": 0.719984233379364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17785121500492096, + "epoch": 3.86, + "learning_rate": 3.068892645815723e-05, + "loss": 0.5192, + "step": 4569, + "task_loss": 0.07080139964818954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6851695775985718, + "epoch": 3.86, + "learning_rate": 3.068469991546914e-05, + "loss": 0.722, + "step": 4570, + "task_loss": 0.7153067588806152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5196282863616943, + "epoch": 3.86, + "learning_rate": 3.068047337278107e-05, + "loss": 0.5594, + "step": 4571, + "task_loss": 0.9176000356674194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28804445266723633, + "epoch": 3.86, + "learning_rate": 3.067624683009299e-05, + "loss": 0.5854, + "step": 4572, + "task_loss": 0.4271126389503479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7349874377250671, + "epoch": 3.87, + "learning_rate": 3.067202028740491e-05, + "loss": 0.5965, + "step": 4573, + "task_loss": 1.055741548538208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37242454290390015, + "epoch": 3.87, + "learning_rate": 3.066779374471682e-05, + "loss": 0.5665, + "step": 4574, + "task_loss": 0.903777003288269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4188491106033325, + "epoch": 3.87, + "learning_rate": 3.066356720202874e-05, + "loss": 0.6317, + "step": 4575, + "task_loss": 0.5148768424987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37328606843948364, + "epoch": 3.87, + "learning_rate": 3.065934065934066e-05, + "loss": 0.4836, + "step": 4576, + "task_loss": 0.6054843664169312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9590966701507568, + "epoch": 3.87, + "learning_rate": 3.065511411665258e-05, + "loss": 0.6847, + "step": 4577, + "task_loss": 1.2013647556304932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6242749691009521, + "epoch": 3.87, + "learning_rate": 3.06508875739645e-05, + "loss": 0.6302, + "step": 4578, + "task_loss": 0.7615480422973633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8141443729400635, + "epoch": 3.87, + "learning_rate": 3.064666103127642e-05, + "loss": 0.9135, + "step": 4579, + "task_loss": 1.2536354064941406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45828643441200256, + "epoch": 3.87, + "learning_rate": 3.064243448858833e-05, + "loss": 0.6748, + "step": 4580, + "task_loss": 0.5116100311279297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6139363646507263, + "epoch": 3.87, + "learning_rate": 3.063820794590025e-05, + "loss": 0.5701, + "step": 4581, + "task_loss": 0.2785990536212921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4159098267555237, + "epoch": 3.87, + "learning_rate": 3.063398140321217e-05, + "loss": 0.6065, + "step": 4582, + "task_loss": 0.14238207042217255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5898284912109375, + "epoch": 3.87, + "learning_rate": 3.062975486052409e-05, + "loss": 0.5293, + "step": 4583, + "task_loss": 0.5675135254859924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6794606447219849, + "epoch": 3.87, + "learning_rate": 3.062552831783601e-05, + "loss": 0.6289, + "step": 4584, + "task_loss": 0.6555403470993042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.518599271774292, + "epoch": 3.88, + "learning_rate": 3.062130177514793e-05, + "loss": 0.6053, + "step": 4585, + "task_loss": 0.9533485770225525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1301783323287964, + "epoch": 3.88, + "learning_rate": 3.061707523245985e-05, + "loss": 0.6882, + "step": 4586, + "task_loss": 0.8909433484077454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.387397825717926, + "epoch": 3.88, + "learning_rate": 3.0612848689771764e-05, + "loss": 0.6066, + "step": 4587, + "task_loss": 0.02731383591890335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6907158493995667, + "epoch": 3.88, + "learning_rate": 3.060862214708369e-05, + "loss": 0.5385, + "step": 4588, + "task_loss": 0.4953126311302185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7738770842552185, + "epoch": 3.88, + "learning_rate": 3.060439560439561e-05, + "loss": 0.6411, + "step": 4589, + "task_loss": 1.83912193775177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8962497711181641, + "epoch": 3.88, + "learning_rate": 3.060016906170752e-05, + "loss": 0.7956, + "step": 4590, + "task_loss": 1.5886213779449463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5566709041595459, + "epoch": 3.88, + "learning_rate": 3.059594251901944e-05, + "loss": 0.6519, + "step": 4591, + "task_loss": 1.3151274919509888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6951749324798584, + "epoch": 3.88, + "learning_rate": 3.059171597633136e-05, + "loss": 0.6482, + "step": 4592, + "task_loss": 1.7493023872375488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.793886661529541, + "epoch": 3.88, + "learning_rate": 3.058748943364328e-05, + "loss": 0.6073, + "step": 4593, + "task_loss": 1.2700213193893433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.537638783454895, + "epoch": 3.88, + "learning_rate": 3.05832628909552e-05, + "loss": 0.6938, + "step": 4594, + "task_loss": 0.9946596622467041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3894442021846771, + "epoch": 3.88, + "learning_rate": 3.057903634826712e-05, + "loss": 0.6339, + "step": 4595, + "task_loss": 0.8490023612976074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5279010534286499, + "epoch": 3.88, + "learning_rate": 3.0574809805579035e-05, + "loss": 0.6679, + "step": 4596, + "task_loss": 1.493714451789856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8200480937957764, + "epoch": 3.89, + "learning_rate": 3.0570583262890955e-05, + "loss": 0.5328, + "step": 4597, + "task_loss": 0.49615558981895447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6064015626907349, + "epoch": 3.89, + "learning_rate": 3.0566356720202874e-05, + "loss": 0.5919, + "step": 4598, + "task_loss": 0.19688251614570618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8986175060272217, + "epoch": 3.89, + "learning_rate": 3.0562130177514794e-05, + "loss": 0.5528, + "step": 4599, + "task_loss": 0.6584007143974304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45789384841918945, + "epoch": 3.89, + "learning_rate": 3.0557903634826714e-05, + "loss": 0.4642, + "step": 4600, + "task_loss": 0.8988075256347656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6040282249450684, + "epoch": 3.89, + "learning_rate": 3.0553677092138634e-05, + "loss": 0.4742, + "step": 4601, + "task_loss": 0.4458990693092346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5014411211013794, + "epoch": 3.89, + "learning_rate": 3.054945054945055e-05, + "loss": 0.5847, + "step": 4602, + "task_loss": 0.13124622404575348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7907803058624268, + "epoch": 3.89, + "learning_rate": 3.0545224006762466e-05, + "loss": 0.807, + "step": 4603, + "task_loss": 0.2885737717151642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6017202138900757, + "epoch": 3.89, + "learning_rate": 3.0540997464074386e-05, + "loss": 0.5377, + "step": 4604, + "task_loss": 1.2401753664016724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9313364624977112, + "epoch": 3.89, + "learning_rate": 3.053677092138631e-05, + "loss": 0.7706, + "step": 4605, + "task_loss": 1.7111760377883911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5182029008865356, + "epoch": 3.89, + "learning_rate": 3.0532544378698226e-05, + "loss": 0.7943, + "step": 4606, + "task_loss": 0.41629770398139954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4889213442802429, + "epoch": 3.89, + "learning_rate": 3.0528317836010145e-05, + "loss": 0.472, + "step": 4607, + "task_loss": 0.4720189571380615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5702635049819946, + "epoch": 3.89, + "learning_rate": 3.0524091293322065e-05, + "loss": 0.5743, + "step": 4608, + "task_loss": 0.4291757643222809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4841800034046173, + "epoch": 3.9, + "learning_rate": 3.051986475063398e-05, + "loss": 0.5174, + "step": 4609, + "task_loss": 0.578289270401001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4954565763473511, + "epoch": 3.9, + "learning_rate": 3.05156382079459e-05, + "loss": 0.5167, + "step": 4610, + "task_loss": 0.7248358130455017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6871786117553711, + "epoch": 3.9, + "learning_rate": 3.051141166525782e-05, + "loss": 0.5318, + "step": 4611, + "task_loss": 0.7253604531288147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2320185899734497, + "epoch": 3.9, + "learning_rate": 3.0507185122569737e-05, + "loss": 0.7797, + "step": 4612, + "task_loss": 1.1038596630096436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5151013731956482, + "epoch": 3.9, + "learning_rate": 3.0502958579881657e-05, + "loss": 0.5489, + "step": 4613, + "task_loss": 0.7845394611358643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5132116079330444, + "epoch": 3.9, + "learning_rate": 3.0498732037193577e-05, + "loss": 0.6172, + "step": 4614, + "task_loss": 0.6595526933670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7027263045310974, + "epoch": 3.9, + "learning_rate": 3.04945054945055e-05, + "loss": 0.5035, + "step": 4615, + "task_loss": 0.6024203300476074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4905238747596741, + "epoch": 3.9, + "learning_rate": 3.0490278951817413e-05, + "loss": 0.6528, + "step": 4616, + "task_loss": 0.550947368144989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47260594367980957, + "epoch": 3.9, + "learning_rate": 3.0486052409129336e-05, + "loss": 0.6501, + "step": 4617, + "task_loss": 1.2506953477859497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4075651168823242, + "epoch": 3.9, + "learning_rate": 3.0481825866441256e-05, + "loss": 0.6616, + "step": 4618, + "task_loss": 0.8619855642318726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32177862524986267, + "epoch": 3.9, + "learning_rate": 3.047759932375317e-05, + "loss": 0.6607, + "step": 4619, + "task_loss": 0.8519116640090942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5030864477157593, + "epoch": 3.9, + "learning_rate": 3.047337278106509e-05, + "loss": 0.6319, + "step": 4620, + "task_loss": 1.0712083578109741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6509045362472534, + "epoch": 3.91, + "learning_rate": 3.046914623837701e-05, + "loss": 0.6804, + "step": 4621, + "task_loss": 0.2627386748790741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5919709205627441, + "epoch": 3.91, + "learning_rate": 3.0464919695688924e-05, + "loss": 0.6214, + "step": 4622, + "task_loss": 0.5721436142921448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7036287784576416, + "epoch": 3.91, + "learning_rate": 3.0460693153000848e-05, + "loss": 0.7923, + "step": 4623, + "task_loss": 1.5684539079666138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9535712599754333, + "epoch": 3.91, + "learning_rate": 3.0456466610312767e-05, + "loss": 0.6906, + "step": 4624, + "task_loss": 0.9885371327400208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6447042226791382, + "epoch": 3.91, + "learning_rate": 3.0452240067624684e-05, + "loss": 0.7969, + "step": 4625, + "task_loss": 1.0202895402908325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3470914959907532, + "epoch": 3.91, + "learning_rate": 3.0448013524936603e-05, + "loss": 0.4902, + "step": 4626, + "task_loss": 0.5662506818771362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8621094226837158, + "epoch": 3.91, + "learning_rate": 3.0443786982248523e-05, + "loss": 0.7771, + "step": 4627, + "task_loss": 0.8232499361038208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6573700904846191, + "epoch": 3.91, + "learning_rate": 3.043956043956044e-05, + "loss": 0.5985, + "step": 4628, + "task_loss": 1.3819470405578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4911420941352844, + "epoch": 3.91, + "learning_rate": 3.043533389687236e-05, + "loss": 0.5981, + "step": 4629, + "task_loss": 1.6177477836608887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5307059288024902, + "epoch": 3.91, + "learning_rate": 3.043110735418428e-05, + "loss": 0.5653, + "step": 4630, + "task_loss": 0.9362890720367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3964237570762634, + "epoch": 3.91, + "learning_rate": 3.04268808114962e-05, + "loss": 0.6331, + "step": 4631, + "task_loss": 0.5317371487617493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7293582558631897, + "epoch": 3.91, + "learning_rate": 3.0422654268808115e-05, + "loss": 0.6117, + "step": 4632, + "task_loss": 1.1754529476165771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5061864256858826, + "epoch": 3.92, + "learning_rate": 3.0418427726120035e-05, + "loss": 0.5866, + "step": 4633, + "task_loss": 0.6947102546691895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39274168014526367, + "epoch": 3.92, + "learning_rate": 3.0414201183431958e-05, + "loss": 0.5249, + "step": 4634, + "task_loss": 0.17787891626358032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4959263205528259, + "epoch": 3.92, + "learning_rate": 3.040997464074387e-05, + "loss": 0.6468, + "step": 4635, + "task_loss": 1.0032756328582764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5696570873260498, + "epoch": 3.92, + "learning_rate": 3.040574809805579e-05, + "loss": 0.5324, + "step": 4636, + "task_loss": 0.22751933336257935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8945135474205017, + "epoch": 3.92, + "learning_rate": 3.0401521555367714e-05, + "loss": 0.7893, + "step": 4637, + "task_loss": 1.4217852354049683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5857203006744385, + "epoch": 3.92, + "learning_rate": 3.0397295012679627e-05, + "loss": 0.5636, + "step": 4638, + "task_loss": 0.2735403776168823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3120154142379761, + "epoch": 3.92, + "learning_rate": 3.0393068469991546e-05, + "loss": 0.6174, + "step": 4639, + "task_loss": 0.11848022043704987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3180572986602783, + "epoch": 3.92, + "learning_rate": 3.038884192730347e-05, + "loss": 0.3543, + "step": 4640, + "task_loss": 0.5309938192367554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45571303367614746, + "epoch": 3.92, + "learning_rate": 3.0384615384615382e-05, + "loss": 0.4835, + "step": 4641, + "task_loss": 0.3462219536304474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36800670623779297, + "epoch": 3.92, + "learning_rate": 3.0380388841927306e-05, + "loss": 0.612, + "step": 4642, + "task_loss": 0.04561953619122505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1671844720840454, + "epoch": 3.92, + "learning_rate": 3.0376162299239225e-05, + "loss": 0.7772, + "step": 4643, + "task_loss": 0.48168498277664185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47081223130226135, + "epoch": 3.93, + "learning_rate": 3.0371935756551145e-05, + "loss": 0.4401, + "step": 4644, + "task_loss": 0.33101022243499756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2047979831695557, + "epoch": 3.93, + "learning_rate": 3.036770921386306e-05, + "loss": 0.7449, + "step": 4645, + "task_loss": 0.6459980010986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3985995054244995, + "epoch": 3.93, + "learning_rate": 3.036348267117498e-05, + "loss": 0.6065, + "step": 4646, + "task_loss": 0.14967221021652222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5883121490478516, + "epoch": 3.93, + "learning_rate": 3.03592561284869e-05, + "loss": 0.7161, + "step": 4647, + "task_loss": 1.446795105934143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6932694911956787, + "epoch": 3.93, + "learning_rate": 3.0355029585798817e-05, + "loss": 0.6715, + "step": 4648, + "task_loss": 1.0540391206741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.442544162273407, + "epoch": 3.93, + "learning_rate": 3.0350803043110737e-05, + "loss": 0.5877, + "step": 4649, + "task_loss": 0.2704816460609436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.484758198261261, + "epoch": 3.93, + "learning_rate": 3.0346576500422657e-05, + "loss": 0.5702, + "step": 4650, + "task_loss": 0.31382569670677185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6236105561256409, + "epoch": 3.93, + "learning_rate": 3.0342349957734573e-05, + "loss": 0.6859, + "step": 4651, + "task_loss": 2.421828269958496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7948158979415894, + "epoch": 3.93, + "learning_rate": 3.0338123415046493e-05, + "loss": 0.6787, + "step": 4652, + "task_loss": 0.23259636759757996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4680336117744446, + "epoch": 3.93, + "learning_rate": 3.0333896872358413e-05, + "loss": 0.5938, + "step": 4653, + "task_loss": 1.1514549255371094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.701113760471344, + "epoch": 3.93, + "learning_rate": 3.032967032967033e-05, + "loss": 0.6772, + "step": 4654, + "task_loss": 0.4720182716846466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8718936443328857, + "epoch": 3.93, + "learning_rate": 3.032544378698225e-05, + "loss": 0.5461, + "step": 4655, + "task_loss": 0.4147320091724396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5229637026786804, + "epoch": 3.94, + "learning_rate": 3.032121724429417e-05, + "loss": 0.4618, + "step": 4656, + "task_loss": 0.7907066345214844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47369498014450073, + "epoch": 3.94, + "learning_rate": 3.0316990701606085e-05, + "loss": 0.4892, + "step": 4657, + "task_loss": 0.17236801981925964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5330003499984741, + "epoch": 3.94, + "learning_rate": 3.0312764158918004e-05, + "loss": 0.577, + "step": 4658, + "task_loss": 1.0430312156677246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6459700465202332, + "epoch": 3.94, + "learning_rate": 3.0308537616229928e-05, + "loss": 0.4586, + "step": 4659, + "task_loss": 0.35774412751197815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7663490176200867, + "epoch": 3.94, + "learning_rate": 3.0304311073541847e-05, + "loss": 0.7398, + "step": 4660, + "task_loss": 1.2521777153015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48543113470077515, + "epoch": 3.94, + "learning_rate": 3.030008453085376e-05, + "loss": 0.5859, + "step": 4661, + "task_loss": 1.53238046169281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6202825307846069, + "epoch": 3.94, + "learning_rate": 3.0295857988165683e-05, + "loss": 0.5863, + "step": 4662, + "task_loss": 0.1928168684244156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4644743800163269, + "epoch": 3.94, + "learning_rate": 3.0291631445477603e-05, + "loss": 0.5111, + "step": 4663, + "task_loss": 0.37532007694244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4748108386993408, + "epoch": 3.94, + "learning_rate": 3.028740490278952e-05, + "loss": 0.5409, + "step": 4664, + "task_loss": 0.2173890322446823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5967252254486084, + "epoch": 3.94, + "learning_rate": 3.028317836010144e-05, + "loss": 0.5199, + "step": 4665, + "task_loss": 1.372541069984436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48646456003189087, + "epoch": 3.94, + "learning_rate": 3.027895181741336e-05, + "loss": 0.5493, + "step": 4666, + "task_loss": 0.4623583257198334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6286332607269287, + "epoch": 3.94, + "learning_rate": 3.0274725274725275e-05, + "loss": 0.634, + "step": 4667, + "task_loss": 1.0798989534378052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4886741638183594, + "epoch": 3.95, + "learning_rate": 3.0270498732037195e-05, + "loss": 0.5292, + "step": 4668, + "task_loss": 0.8430002331733704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7612294554710388, + "epoch": 3.95, + "learning_rate": 3.0266272189349115e-05, + "loss": 0.7897, + "step": 4669, + "task_loss": 1.52606201171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34037041664123535, + "epoch": 3.95, + "learning_rate": 3.026204564666103e-05, + "loss": 0.5062, + "step": 4670, + "task_loss": 0.47254952788352966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.562741756439209, + "epoch": 3.95, + "learning_rate": 3.025781910397295e-05, + "loss": 0.4963, + "step": 4671, + "task_loss": 0.7736961841583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27162396907806396, + "epoch": 3.95, + "learning_rate": 3.025359256128487e-05, + "loss": 0.5941, + "step": 4672, + "task_loss": 0.8885185718536377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6105550527572632, + "epoch": 3.95, + "learning_rate": 3.024936601859679e-05, + "loss": 0.6522, + "step": 4673, + "task_loss": 0.6301946640014648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6110946536064148, + "epoch": 3.95, + "learning_rate": 3.0245139475908707e-05, + "loss": 0.7506, + "step": 4674, + "task_loss": 0.6913084983825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7323060035705566, + "epoch": 3.95, + "learning_rate": 3.0240912933220626e-05, + "loss": 0.8128, + "step": 4675, + "task_loss": 0.5939364433288574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3728654384613037, + "epoch": 3.95, + "learning_rate": 3.023668639053255e-05, + "loss": 0.6279, + "step": 4676, + "task_loss": 0.8307532072067261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7955504059791565, + "epoch": 3.95, + "learning_rate": 3.0232459847844463e-05, + "loss": 0.4841, + "step": 4677, + "task_loss": 0.854138970375061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42262211441993713, + "epoch": 3.95, + "learning_rate": 3.0228233305156382e-05, + "loss": 0.3941, + "step": 4678, + "task_loss": 1.0354888439178467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5925530195236206, + "epoch": 3.95, + "learning_rate": 3.0224006762468305e-05, + "loss": 0.5808, + "step": 4679, + "task_loss": 0.7890202403068542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7886413335800171, + "epoch": 3.96, + "learning_rate": 3.021978021978022e-05, + "loss": 0.5881, + "step": 4680, + "task_loss": 1.035560131072998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5514804720878601, + "epoch": 3.96, + "learning_rate": 3.021555367709214e-05, + "loss": 0.7138, + "step": 4681, + "task_loss": 0.2693343460559845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5306179523468018, + "epoch": 3.96, + "learning_rate": 3.021132713440406e-05, + "loss": 0.6814, + "step": 4682, + "task_loss": 0.6615212559700012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5154081583023071, + "epoch": 3.96, + "learning_rate": 3.0207100591715974e-05, + "loss": 0.5461, + "step": 4683, + "task_loss": 0.5905464887619019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7044644951820374, + "epoch": 3.96, + "learning_rate": 3.0202874049027897e-05, + "loss": 0.69, + "step": 4684, + "task_loss": 1.3696662187576294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3044279217720032, + "epoch": 3.96, + "learning_rate": 3.0198647506339817e-05, + "loss": 0.4241, + "step": 4685, + "task_loss": 0.5317029356956482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9315177798271179, + "epoch": 3.96, + "learning_rate": 3.019442096365173e-05, + "loss": 0.7594, + "step": 4686, + "task_loss": 0.9961008429527283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48297902941703796, + "epoch": 3.96, + "learning_rate": 3.0190194420963653e-05, + "loss": 0.4391, + "step": 4687, + "task_loss": 1.5042399168014526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7176103591918945, + "epoch": 3.96, + "learning_rate": 3.0185967878275573e-05, + "loss": 0.6331, + "step": 4688, + "task_loss": 0.3522810935974121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5474249124526978, + "epoch": 3.96, + "learning_rate": 3.0181741335587493e-05, + "loss": 0.5534, + "step": 4689, + "task_loss": 0.903675377368927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.578201174736023, + "epoch": 3.96, + "learning_rate": 3.017751479289941e-05, + "loss": 0.5727, + "step": 4690, + "task_loss": 0.34195613861083984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6848090887069702, + "epoch": 3.96, + "learning_rate": 3.017328825021133e-05, + "loss": 0.6145, + "step": 4691, + "task_loss": 1.541022539138794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8923132419586182, + "epoch": 3.97, + "learning_rate": 3.016906170752325e-05, + "loss": 0.6219, + "step": 4692, + "task_loss": 0.8312207460403442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6767929196357727, + "epoch": 3.97, + "learning_rate": 3.0164835164835165e-05, + "loss": 0.7494, + "step": 4693, + "task_loss": 1.2637939453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3764417767524719, + "epoch": 3.97, + "learning_rate": 3.0160608622147085e-05, + "loss": 0.503, + "step": 4694, + "task_loss": 0.7010582685470581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4886600375175476, + "epoch": 3.97, + "learning_rate": 3.0156382079459004e-05, + "loss": 0.5096, + "step": 4695, + "task_loss": 0.35994237661361694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5162187814712524, + "epoch": 3.97, + "learning_rate": 3.015215553677092e-05, + "loss": 0.5042, + "step": 4696, + "task_loss": 0.9106240272521973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3719951808452606, + "epoch": 3.97, + "learning_rate": 3.014792899408284e-05, + "loss": 0.4696, + "step": 4697, + "task_loss": 0.425300270318985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5133061408996582, + "epoch": 3.97, + "learning_rate": 3.0143702451394764e-05, + "loss": 0.4871, + "step": 4698, + "task_loss": 1.1612643003463745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5151873826980591, + "epoch": 3.97, + "learning_rate": 3.0139475908706676e-05, + "loss": 0.5259, + "step": 4699, + "task_loss": 0.6632486581802368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26280128955841064, + "epoch": 3.97, + "learning_rate": 3.0135249366018596e-05, + "loss": 0.4463, + "step": 4700, + "task_loss": 0.6082665920257568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47703397274017334, + "epoch": 3.97, + "learning_rate": 3.013102282333052e-05, + "loss": 0.6013, + "step": 4701, + "task_loss": 0.7142811417579651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7605900764465332, + "epoch": 3.97, + "learning_rate": 3.012679628064244e-05, + "loss": 0.6102, + "step": 4702, + "task_loss": 0.35928454995155334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8163631558418274, + "epoch": 3.97, + "learning_rate": 3.0122569737954352e-05, + "loss": 0.736, + "step": 4703, + "task_loss": 0.5575838088989258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5625446438789368, + "epoch": 3.98, + "learning_rate": 3.0118343195266275e-05, + "loss": 0.7405, + "step": 4704, + "task_loss": 0.39987924695014954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5083070397377014, + "epoch": 3.98, + "learning_rate": 3.0114116652578195e-05, + "loss": 0.6836, + "step": 4705, + "task_loss": 0.6831318140029907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6689057946205139, + "epoch": 3.98, + "learning_rate": 3.010989010989011e-05, + "loss": 0.6175, + "step": 4706, + "task_loss": 0.9765880703926086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.788018524646759, + "epoch": 3.98, + "learning_rate": 3.010566356720203e-05, + "loss": 0.6306, + "step": 4707, + "task_loss": 0.49589768052101135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.613419771194458, + "epoch": 3.98, + "learning_rate": 3.010143702451395e-05, + "loss": 0.6317, + "step": 4708, + "task_loss": 0.5823269486427307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.561852216720581, + "epoch": 3.98, + "learning_rate": 3.0097210481825867e-05, + "loss": 0.6183, + "step": 4709, + "task_loss": 0.5153210163116455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6073218584060669, + "epoch": 3.98, + "learning_rate": 3.0092983939137787e-05, + "loss": 0.7494, + "step": 4710, + "task_loss": 0.6586194038391113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27364081144332886, + "epoch": 3.98, + "learning_rate": 3.0088757396449707e-05, + "loss": 0.5944, + "step": 4711, + "task_loss": 0.3483925461769104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5581334829330444, + "epoch": 3.98, + "learning_rate": 3.0084530853761623e-05, + "loss": 0.6322, + "step": 4712, + "task_loss": 1.072516918182373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3722652792930603, + "epoch": 3.98, + "learning_rate": 3.0080304311073543e-05, + "loss": 0.4809, + "step": 4713, + "task_loss": 0.7023463249206543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6974066495895386, + "epoch": 3.98, + "learning_rate": 3.0076077768385462e-05, + "loss": 0.8307, + "step": 4714, + "task_loss": 0.6060879826545715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45256704092025757, + "epoch": 3.99, + "learning_rate": 3.007185122569738e-05, + "loss": 0.5379, + "step": 4715, + "task_loss": 0.5103746652603149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40717703104019165, + "epoch": 3.99, + "learning_rate": 3.00676246830093e-05, + "loss": 0.6681, + "step": 4716, + "task_loss": 0.19918246567249298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5772171020507812, + "epoch": 3.99, + "learning_rate": 3.0063398140321218e-05, + "loss": 0.5883, + "step": 4717, + "task_loss": 0.4013548493385315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5430459380149841, + "epoch": 3.99, + "learning_rate": 3.005917159763314e-05, + "loss": 0.5283, + "step": 4718, + "task_loss": 0.3575628995895386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33911755681037903, + "epoch": 3.99, + "learning_rate": 3.0054945054945054e-05, + "loss": 0.6451, + "step": 4719, + "task_loss": 0.2961440980434418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5955850481987, + "epoch": 3.99, + "learning_rate": 3.0050718512256974e-05, + "loss": 0.5115, + "step": 4720, + "task_loss": 0.9544125199317932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40576303005218506, + "epoch": 3.99, + "learning_rate": 3.0046491969568897e-05, + "loss": 0.4771, + "step": 4721, + "task_loss": 0.7533657550811768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9449419975280762, + "epoch": 3.99, + "learning_rate": 3.004226542688081e-05, + "loss": 0.6291, + "step": 4722, + "task_loss": 1.2539595365524292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2913803160190582, + "epoch": 3.99, + "learning_rate": 3.0038038884192733e-05, + "loss": 0.5302, + "step": 4723, + "task_loss": 0.43162453174591064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36746746301651, + "epoch": 3.99, + "learning_rate": 3.0033812341504653e-05, + "loss": 0.5239, + "step": 4724, + "task_loss": 0.5555220246315002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7682191133499146, + "epoch": 3.99, + "learning_rate": 3.0029585798816566e-05, + "loss": 0.6528, + "step": 4725, + "task_loss": 0.3656502068042755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5036412477493286, + "epoch": 3.99, + "learning_rate": 3.002535925612849e-05, + "loss": 0.4916, + "step": 4726, + "task_loss": 0.2712375819683075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9559680223464966, + "epoch": 4.0, + "learning_rate": 3.002113271344041e-05, + "loss": 0.6769, + "step": 4727, + "task_loss": 1.4650204181671143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6178359985351562, + "epoch": 4.0, + "learning_rate": 3.0016906170752325e-05, + "loss": 0.5574, + "step": 4728, + "task_loss": 1.5286884307861328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19341501593589783, + "epoch": 4.0, + "learning_rate": 3.0012679628064245e-05, + "loss": 0.3833, + "step": 4729, + "task_loss": 0.07825923711061478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7431408166885376, + "epoch": 4.0, + "learning_rate": 3.0008453085376165e-05, + "loss": 0.8137, + "step": 4730, + "task_loss": 1.092724323272705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8021806478500366, + "epoch": 4.0, + "learning_rate": 3.0004226542688084e-05, + "loss": 0.5668, + "step": 4731, + "task_loss": 1.0776853561401367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5641326904296875, + "epoch": 4.0, + "learning_rate": 3e-05, + "loss": 0.6357, + "step": 4732, + "task_loss": 0.3521232008934021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39596685767173767, + "epoch": 4.0, + "learning_rate": 2.999577345731192e-05, + "loss": 0.7313, + "step": 4733, + "task_loss": 0.8014976978302002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7462252974510193, + "epoch": 4.0, + "learning_rate": 2.999154691462384e-05, + "loss": 0.691, + "step": 4734, + "task_loss": 1.1207897663116455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4042583703994751, + "epoch": 4.0, + "learning_rate": 2.9987320371935757e-05, + "loss": 0.5421, + "step": 4735, + "task_loss": 1.1607073545455933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6367651224136353, + "epoch": 4.0, + "learning_rate": 2.9983093829247676e-05, + "loss": 0.5674, + "step": 4736, + "task_loss": 1.2157838344573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4351539611816406, + "epoch": 4.0, + "learning_rate": 2.9978867286559596e-05, + "loss": 0.5, + "step": 4737, + "task_loss": 0.20542965829372406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5558850169181824, + "epoch": 4.01, + "learning_rate": 2.9974640743871512e-05, + "loss": 0.4686, + "step": 4738, + "task_loss": 0.48375123739242554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7042853236198425, + "epoch": 4.01, + "learning_rate": 2.9970414201183432e-05, + "loss": 0.6005, + "step": 4739, + "task_loss": 0.40970611572265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34301328659057617, + "epoch": 4.01, + "learning_rate": 2.9966187658495355e-05, + "loss": 0.5167, + "step": 4740, + "task_loss": 0.31376785039901733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4341970682144165, + "epoch": 4.01, + "learning_rate": 2.9961961115807268e-05, + "loss": 0.5621, + "step": 4741, + "task_loss": 0.2768462896347046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8227153420448303, + "epoch": 4.01, + "learning_rate": 2.9957734573119188e-05, + "loss": 0.6527, + "step": 4742, + "task_loss": 0.26090165972709656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28555893898010254, + "epoch": 4.01, + "learning_rate": 2.995350803043111e-05, + "loss": 0.5548, + "step": 4743, + "task_loss": 0.05753535404801369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41741886734962463, + "epoch": 4.01, + "learning_rate": 2.9949281487743024e-05, + "loss": 0.5999, + "step": 4744, + "task_loss": 0.6317171454429626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45832276344299316, + "epoch": 4.01, + "learning_rate": 2.9945054945054947e-05, + "loss": 0.5088, + "step": 4745, + "task_loss": 0.4320179224014282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4288496971130371, + "epoch": 4.01, + "learning_rate": 2.9940828402366867e-05, + "loss": 0.4188, + "step": 4746, + "task_loss": 0.2928406000137329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0759358406066895, + "epoch": 4.01, + "learning_rate": 2.9936601859678787e-05, + "loss": 0.7903, + "step": 4747, + "task_loss": 0.31203359365463257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5567479133605957, + "epoch": 4.01, + "learning_rate": 2.9932375316990703e-05, + "loss": 0.7488, + "step": 4748, + "task_loss": 0.7170420289039612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0591440200805664, + "epoch": 4.01, + "learning_rate": 2.9928148774302623e-05, + "loss": 0.7233, + "step": 4749, + "task_loss": 0.6277914643287659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3386024832725525, + "epoch": 4.02, + "learning_rate": 2.9923922231614543e-05, + "loss": 0.4745, + "step": 4750, + "task_loss": 0.5792732238769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3787451684474945, + "epoch": 4.02, + "learning_rate": 2.991969568892646e-05, + "loss": 0.4155, + "step": 4751, + "task_loss": 0.39815253019332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6467211842536926, + "epoch": 4.02, + "learning_rate": 2.991546914623838e-05, + "loss": 0.7457, + "step": 4752, + "task_loss": 0.872768223285675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5191845893859863, + "epoch": 4.02, + "learning_rate": 2.99112426035503e-05, + "loss": 0.7095, + "step": 4753, + "task_loss": 0.8604825735092163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46322154998779297, + "epoch": 4.02, + "learning_rate": 2.9907016060862215e-05, + "loss": 0.6925, + "step": 4754, + "task_loss": 0.13558031618595123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5248489379882812, + "epoch": 4.02, + "learning_rate": 2.9902789518174134e-05, + "loss": 0.5462, + "step": 4755, + "task_loss": 0.6144562363624573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3355175256729126, + "epoch": 4.02, + "learning_rate": 2.9898562975486054e-05, + "loss": 0.647, + "step": 4756, + "task_loss": 0.44851887226104736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6245940923690796, + "epoch": 4.02, + "learning_rate": 2.989433643279797e-05, + "loss": 0.4949, + "step": 4757, + "task_loss": 0.978074848651886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38768285512924194, + "epoch": 4.02, + "learning_rate": 2.989010989010989e-05, + "loss": 0.6135, + "step": 4758, + "task_loss": 0.23951290547847748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5809382200241089, + "epoch": 4.02, + "learning_rate": 2.988588334742181e-05, + "loss": 0.5323, + "step": 4759, + "task_loss": 0.8166263699531555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3891773521900177, + "epoch": 4.02, + "learning_rate": 2.9881656804733733e-05, + "loss": 0.4029, + "step": 4760, + "task_loss": 0.4305973947048187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0928038358688354, + "epoch": 4.02, + "learning_rate": 2.9877430262045646e-05, + "loss": 0.651, + "step": 4761, + "task_loss": 1.4423768520355225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5874948501586914, + "epoch": 4.03, + "learning_rate": 2.987320371935757e-05, + "loss": 0.6702, + "step": 4762, + "task_loss": 0.8745338916778564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.682579755783081, + "epoch": 4.03, + "learning_rate": 2.986897717666949e-05, + "loss": 0.5267, + "step": 4763, + "task_loss": 0.46137967705726624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5621275305747986, + "epoch": 4.03, + "learning_rate": 2.9864750633981402e-05, + "loss": 0.5376, + "step": 4764, + "task_loss": 0.43116816878318787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5357831716537476, + "epoch": 4.03, + "learning_rate": 2.9860524091293325e-05, + "loss": 0.6038, + "step": 4765, + "task_loss": 0.593547523021698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33915096521377563, + "epoch": 4.03, + "learning_rate": 2.9856297548605245e-05, + "loss": 0.554, + "step": 4766, + "task_loss": 0.6328922510147095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4777359962463379, + "epoch": 4.03, + "learning_rate": 2.9852071005917158e-05, + "loss": 0.4684, + "step": 4767, + "task_loss": 0.12932059168815613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4044278860092163, + "epoch": 4.03, + "learning_rate": 2.984784446322908e-05, + "loss": 0.3792, + "step": 4768, + "task_loss": 0.45302632451057434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5154479742050171, + "epoch": 4.03, + "learning_rate": 2.9843617920541e-05, + "loss": 0.367, + "step": 4769, + "task_loss": 0.4945833384990692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8677091002464294, + "epoch": 4.03, + "learning_rate": 2.9839391377852917e-05, + "loss": 0.7079, + "step": 4770, + "task_loss": 0.4092583656311035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24866575002670288, + "epoch": 4.03, + "learning_rate": 2.9835164835164837e-05, + "loss": 0.3416, + "step": 4771, + "task_loss": 0.21698196232318878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37771540880203247, + "epoch": 4.03, + "learning_rate": 2.9830938292476756e-05, + "loss": 0.4638, + "step": 4772, + "task_loss": 0.4372885227203369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.278551459312439, + "epoch": 4.03, + "learning_rate": 2.9826711749788673e-05, + "loss": 0.6601, + "step": 4773, + "task_loss": 1.5556211471557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36623716354370117, + "epoch": 4.04, + "learning_rate": 2.9822485207100593e-05, + "loss": 0.4309, + "step": 4774, + "task_loss": 0.3765921890735626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4066579043865204, + "epoch": 4.04, + "learning_rate": 2.9818258664412512e-05, + "loss": 0.455, + "step": 4775, + "task_loss": 0.5585378408432007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4618692100048065, + "epoch": 4.04, + "learning_rate": 2.9814032121724432e-05, + "loss": 0.5495, + "step": 4776, + "task_loss": 0.5024696588516235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3278048634529114, + "epoch": 4.04, + "learning_rate": 2.980980557903635e-05, + "loss": 0.4986, + "step": 4777, + "task_loss": 1.075730800628662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5876538753509521, + "epoch": 4.04, + "learning_rate": 2.9805579036348268e-05, + "loss": 0.6961, + "step": 4778, + "task_loss": 0.7248564958572388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6468080878257751, + "epoch": 4.04, + "learning_rate": 2.980135249366019e-05, + "loss": 0.5761, + "step": 4779, + "task_loss": 1.5820600986480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3380969166755676, + "epoch": 4.04, + "learning_rate": 2.9797125950972104e-05, + "loss": 0.5324, + "step": 4780, + "task_loss": 0.9099416136741638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6064426898956299, + "epoch": 4.04, + "learning_rate": 2.9792899408284024e-05, + "loss": 0.5628, + "step": 4781, + "task_loss": 0.8687354326248169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4320724606513977, + "epoch": 4.04, + "learning_rate": 2.9788672865595947e-05, + "loss": 0.5117, + "step": 4782, + "task_loss": 0.48203763365745544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4345114529132843, + "epoch": 4.04, + "learning_rate": 2.978444632290786e-05, + "loss": 0.3607, + "step": 4783, + "task_loss": 0.5369747877120972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40863925218582153, + "epoch": 4.04, + "learning_rate": 2.978021978021978e-05, + "loss": 0.4975, + "step": 4784, + "task_loss": 0.716952919960022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.471333384513855, + "epoch": 4.04, + "learning_rate": 2.9775993237531703e-05, + "loss": 0.6046, + "step": 4785, + "task_loss": 0.4160959720611572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5143387913703918, + "epoch": 4.05, + "learning_rate": 2.9771766694843616e-05, + "loss": 0.5247, + "step": 4786, + "task_loss": 0.6719158291816711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3797294795513153, + "epoch": 4.05, + "learning_rate": 2.976754015215554e-05, + "loss": 0.6387, + "step": 4787, + "task_loss": 0.6243707537651062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5103291273117065, + "epoch": 4.05, + "learning_rate": 2.976331360946746e-05, + "loss": 0.4673, + "step": 4788, + "task_loss": 0.9570773839950562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5803094506263733, + "epoch": 4.05, + "learning_rate": 2.975908706677938e-05, + "loss": 0.5807, + "step": 4789, + "task_loss": 0.5373027920722961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9603193998336792, + "epoch": 4.05, + "learning_rate": 2.9754860524091295e-05, + "loss": 0.8264, + "step": 4790, + "task_loss": 0.5363649129867554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20841357111930847, + "epoch": 4.05, + "learning_rate": 2.9750633981403215e-05, + "loss": 0.3852, + "step": 4791, + "task_loss": 0.09408742934465408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3148472309112549, + "epoch": 4.05, + "learning_rate": 2.9746407438715134e-05, + "loss": 0.4742, + "step": 4792, + "task_loss": 0.8518476486206055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5975577235221863, + "epoch": 4.05, + "learning_rate": 2.974218089602705e-05, + "loss": 0.519, + "step": 4793, + "task_loss": 0.551813542842865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3116340637207031, + "epoch": 4.05, + "learning_rate": 2.973795435333897e-05, + "loss": 0.4622, + "step": 4794, + "task_loss": 0.10413404554128647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3479987680912018, + "epoch": 4.05, + "learning_rate": 2.973372781065089e-05, + "loss": 0.4832, + "step": 4795, + "task_loss": 0.023804357275366783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8424704074859619, + "epoch": 4.05, + "learning_rate": 2.9729501267962806e-05, + "loss": 0.5387, + "step": 4796, + "task_loss": 0.8505396842956543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.439983069896698, + "epoch": 4.05, + "learning_rate": 2.9725274725274726e-05, + "loss": 0.4352, + "step": 4797, + "task_loss": 0.8187170624732971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35773593187332153, + "epoch": 4.06, + "learning_rate": 2.9721048182586646e-05, + "loss": 0.4926, + "step": 4798, + "task_loss": 0.3739623427391052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42781004309654236, + "epoch": 4.06, + "learning_rate": 2.9716821639898562e-05, + "loss": 0.444, + "step": 4799, + "task_loss": 0.641234278678894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7133920192718506, + "epoch": 4.06, + "learning_rate": 2.9712595097210482e-05, + "loss": 0.5427, + "step": 4800, + "task_loss": 0.1334468424320221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5091303586959839, + "epoch": 4.06, + "learning_rate": 2.9708368554522402e-05, + "loss": 0.5804, + "step": 4801, + "task_loss": 0.9751131534576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26588380336761475, + "epoch": 4.06, + "learning_rate": 2.9704142011834318e-05, + "loss": 0.5459, + "step": 4802, + "task_loss": 0.09333113580942154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37200456857681274, + "epoch": 4.06, + "learning_rate": 2.9699915469146238e-05, + "loss": 0.5496, + "step": 4803, + "task_loss": 0.33158430457115173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4736981689929962, + "epoch": 4.06, + "learning_rate": 2.969568892645816e-05, + "loss": 0.5455, + "step": 4804, + "task_loss": 1.27243971824646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20986230671405792, + "epoch": 4.06, + "learning_rate": 2.969146238377008e-05, + "loss": 0.5106, + "step": 4805, + "task_loss": 0.2970461845397949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43755829334259033, + "epoch": 4.06, + "learning_rate": 2.9687235841081994e-05, + "loss": 0.6134, + "step": 4806, + "task_loss": 0.5337417721748352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0892823934555054, + "epoch": 4.06, + "learning_rate": 2.9683009298393917e-05, + "loss": 0.7501, + "step": 4807, + "task_loss": 0.5199514031410217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6703680753707886, + "epoch": 4.06, + "learning_rate": 2.9678782755705837e-05, + "loss": 0.763, + "step": 4808, + "task_loss": 0.771691620349884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8760867118835449, + "epoch": 4.07, + "learning_rate": 2.9674556213017753e-05, + "loss": 0.5981, + "step": 4809, + "task_loss": 1.4003020524978638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36402013897895813, + "epoch": 4.07, + "learning_rate": 2.9670329670329673e-05, + "loss": 0.5352, + "step": 4810, + "task_loss": 0.2539736032485962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2363978624343872, + "epoch": 4.07, + "learning_rate": 2.9666103127641592e-05, + "loss": 0.5672, + "step": 4811, + "task_loss": 0.3959346413612366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3834642767906189, + "epoch": 4.07, + "learning_rate": 2.966187658495351e-05, + "loss": 0.3952, + "step": 4812, + "task_loss": 1.3377703428268433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39688557386398315, + "epoch": 4.07, + "learning_rate": 2.965765004226543e-05, + "loss": 0.5489, + "step": 4813, + "task_loss": 0.824809730052948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5481680631637573, + "epoch": 4.07, + "learning_rate": 2.9653423499577348e-05, + "loss": 0.6434, + "step": 4814, + "task_loss": 0.7854126691818237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5738096833229065, + "epoch": 4.07, + "learning_rate": 2.9649196956889265e-05, + "loss": 0.5896, + "step": 4815, + "task_loss": 0.18337300419807434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4052640199661255, + "epoch": 4.07, + "learning_rate": 2.9644970414201184e-05, + "loss": 0.4623, + "step": 4816, + "task_loss": 0.24387280642986298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5931398868560791, + "epoch": 4.07, + "learning_rate": 2.9640743871513104e-05, + "loss": 0.638, + "step": 4817, + "task_loss": 1.1006051301956177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.537839412689209, + "epoch": 4.07, + "learning_rate": 2.9636517328825024e-05, + "loss": 0.4506, + "step": 4818, + "task_loss": 0.612963080406189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.760231614112854, + "epoch": 4.07, + "learning_rate": 2.963229078613694e-05, + "loss": 0.4802, + "step": 4819, + "task_loss": 0.6048200726509094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5098378658294678, + "epoch": 4.07, + "learning_rate": 2.962806424344886e-05, + "loss": 0.4477, + "step": 4820, + "task_loss": 0.5537385940551758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38817596435546875, + "epoch": 4.08, + "learning_rate": 2.9623837700760783e-05, + "loss": 0.5129, + "step": 4821, + "task_loss": 0.5576894283294678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4731392562389374, + "epoch": 4.08, + "learning_rate": 2.9619611158072696e-05, + "loss": 0.706, + "step": 4822, + "task_loss": 0.5801501274108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4706273376941681, + "epoch": 4.08, + "learning_rate": 2.9615384615384616e-05, + "loss": 0.5677, + "step": 4823, + "task_loss": 0.632493793964386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4360937476158142, + "epoch": 4.08, + "learning_rate": 2.961115807269654e-05, + "loss": 0.5073, + "step": 4824, + "task_loss": 0.4308849275112152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5204259157180786, + "epoch": 4.08, + "learning_rate": 2.9606931530008452e-05, + "loss": 0.5955, + "step": 4825, + "task_loss": 0.6966062188148499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8863681554794312, + "epoch": 4.08, + "learning_rate": 2.9602704987320375e-05, + "loss": 0.5645, + "step": 4826, + "task_loss": 0.6026942729949951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3148103952407837, + "epoch": 4.08, + "learning_rate": 2.9598478444632295e-05, + "loss": 0.5269, + "step": 4827, + "task_loss": 0.35631802678108215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5453001856803894, + "epoch": 4.08, + "learning_rate": 2.9594251901944208e-05, + "loss": 0.6453, + "step": 4828, + "task_loss": 0.5687766075134277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36892998218536377, + "epoch": 4.08, + "learning_rate": 2.959002535925613e-05, + "loss": 0.5251, + "step": 4829, + "task_loss": 0.6104440093040466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39301788806915283, + "epoch": 4.08, + "learning_rate": 2.958579881656805e-05, + "loss": 0.4952, + "step": 4830, + "task_loss": 0.9034949541091919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34641480445861816, + "epoch": 4.08, + "learning_rate": 2.9581572273879963e-05, + "loss": 0.5685, + "step": 4831, + "task_loss": 0.6105297207832336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6269800662994385, + "epoch": 4.08, + "learning_rate": 2.9577345731191887e-05, + "loss": 0.5792, + "step": 4832, + "task_loss": 1.1485555171966553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4222157895565033, + "epoch": 4.09, + "learning_rate": 2.9573119188503806e-05, + "loss": 0.6112, + "step": 4833, + "task_loss": 1.414275050163269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3817088305950165, + "epoch": 4.09, + "learning_rate": 2.9568892645815726e-05, + "loss": 0.4512, + "step": 4834, + "task_loss": 0.7417781949043274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4314614534378052, + "epoch": 4.09, + "learning_rate": 2.9564666103127642e-05, + "loss": 0.4335, + "step": 4835, + "task_loss": 0.5572116374969482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.669198751449585, + "epoch": 4.09, + "learning_rate": 2.9560439560439562e-05, + "loss": 0.7072, + "step": 4836, + "task_loss": 1.3890420198440552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42207905650138855, + "epoch": 4.09, + "learning_rate": 2.9556213017751482e-05, + "loss": 0.516, + "step": 4837, + "task_loss": 0.761085033416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7370420694351196, + "epoch": 4.09, + "learning_rate": 2.9551986475063398e-05, + "loss": 0.7362, + "step": 4838, + "task_loss": 1.9911420345306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5360132455825806, + "epoch": 4.09, + "learning_rate": 2.9547759932375318e-05, + "loss": 0.6671, + "step": 4839, + "task_loss": 0.6991643309593201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6822996139526367, + "epoch": 4.09, + "learning_rate": 2.9543533389687238e-05, + "loss": 0.5954, + "step": 4840, + "task_loss": 0.5343037247657776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38074225187301636, + "epoch": 4.09, + "learning_rate": 2.9539306846999154e-05, + "loss": 0.5714, + "step": 4841, + "task_loss": 0.8787013292312622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5015571117401123, + "epoch": 4.09, + "learning_rate": 2.9535080304311074e-05, + "loss": 0.6689, + "step": 4842, + "task_loss": 0.6182490587234497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3814687430858612, + "epoch": 4.09, + "learning_rate": 2.9530853761622997e-05, + "loss": 0.4261, + "step": 4843, + "task_loss": 0.8015026450157166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6589394807815552, + "epoch": 4.09, + "learning_rate": 2.952662721893491e-05, + "loss": 0.5177, + "step": 4844, + "task_loss": 0.6222488880157471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6663868427276611, + "epoch": 4.1, + "learning_rate": 2.952240067624683e-05, + "loss": 0.5494, + "step": 4845, + "task_loss": 1.3003631830215454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.584073543548584, + "epoch": 4.1, + "learning_rate": 2.9518174133558753e-05, + "loss": 0.6328, + "step": 4846, + "task_loss": 0.6526092886924744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7385438680648804, + "epoch": 4.1, + "learning_rate": 2.9513947590870672e-05, + "loss": 0.487, + "step": 4847, + "task_loss": 0.4470556974411011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4144344925880432, + "epoch": 4.1, + "learning_rate": 2.9509721048182585e-05, + "loss": 0.6132, + "step": 4848, + "task_loss": 0.8664945363998413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7791794538497925, + "epoch": 4.1, + "learning_rate": 2.950549450549451e-05, + "loss": 0.6175, + "step": 4849, + "task_loss": 0.5026871562004089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5542319416999817, + "epoch": 4.1, + "learning_rate": 2.9501267962806428e-05, + "loss": 0.5481, + "step": 4850, + "task_loss": 0.5554592609405518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32958918809890747, + "epoch": 4.1, + "learning_rate": 2.9497041420118345e-05, + "loss": 0.4215, + "step": 4851, + "task_loss": 0.2574290931224823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4067208766937256, + "epoch": 4.1, + "learning_rate": 2.9492814877430264e-05, + "loss": 0.5335, + "step": 4852, + "task_loss": 0.6319681406021118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35255634784698486, + "epoch": 4.1, + "learning_rate": 2.9488588334742184e-05, + "loss": 0.462, + "step": 4853, + "task_loss": 0.22451795637607574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5688884258270264, + "epoch": 4.1, + "learning_rate": 2.94843617920541e-05, + "loss": 0.5155, + "step": 4854, + "task_loss": 0.1819257140159607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5997670292854309, + "epoch": 4.1, + "learning_rate": 2.948013524936602e-05, + "loss": 0.7225, + "step": 4855, + "task_loss": 0.46337664127349854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45964354276657104, + "epoch": 4.1, + "learning_rate": 2.947590870667794e-05, + "loss": 0.4617, + "step": 4856, + "task_loss": 0.6211599111557007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9337481260299683, + "epoch": 4.11, + "learning_rate": 2.9471682163989856e-05, + "loss": 0.6404, + "step": 4857, + "task_loss": 0.7539032697677612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9196542501449585, + "epoch": 4.11, + "learning_rate": 2.9467455621301776e-05, + "loss": 0.7256, + "step": 4858, + "task_loss": 1.547803521156311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.670647144317627, + "epoch": 4.11, + "learning_rate": 2.9463229078613696e-05, + "loss": 0.4658, + "step": 4859, + "task_loss": 0.6859316229820251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8582586050033569, + "epoch": 4.11, + "learning_rate": 2.9459002535925612e-05, + "loss": 0.5858, + "step": 4860, + "task_loss": 0.40182241797447205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6735626459121704, + "epoch": 4.11, + "learning_rate": 2.9454775993237532e-05, + "loss": 0.5826, + "step": 4861, + "task_loss": 1.1858372688293457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7336824536323547, + "epoch": 4.11, + "learning_rate": 2.945054945054945e-05, + "loss": 0.6855, + "step": 4862, + "task_loss": 1.2073827981948853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6279140114784241, + "epoch": 4.11, + "learning_rate": 2.9446322907861375e-05, + "loss": 0.4896, + "step": 4863, + "task_loss": 1.4833605289459229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4476397633552551, + "epoch": 4.11, + "learning_rate": 2.9442096365173288e-05, + "loss": 0.462, + "step": 4864, + "task_loss": 0.9270361661911011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0851538181304932, + "epoch": 4.11, + "learning_rate": 2.9437869822485207e-05, + "loss": 0.7726, + "step": 4865, + "task_loss": 0.8774154782295227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4475991129875183, + "epoch": 4.11, + "learning_rate": 2.943364327979713e-05, + "loss": 0.4685, + "step": 4866, + "task_loss": 0.8371140360832214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6269153952598572, + "epoch": 4.11, + "learning_rate": 2.9429416737109043e-05, + "loss": 0.5023, + "step": 4867, + "task_loss": 0.9607031345367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49514877796173096, + "epoch": 4.11, + "learning_rate": 2.9425190194420967e-05, + "loss": 0.5629, + "step": 4868, + "task_loss": 1.8721123933792114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16147325932979584, + "epoch": 4.12, + "learning_rate": 2.9420963651732886e-05, + "loss": 0.4341, + "step": 4869, + "task_loss": 0.048075880855321884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7240965366363525, + "epoch": 4.12, + "learning_rate": 2.94167371090448e-05, + "loss": 0.6844, + "step": 4870, + "task_loss": 0.9053167700767517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5797220468521118, + "epoch": 4.12, + "learning_rate": 2.9412510566356722e-05, + "loss": 0.6192, + "step": 4871, + "task_loss": 0.602137565612793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6932028532028198, + "epoch": 4.12, + "learning_rate": 2.9408284023668642e-05, + "loss": 0.5983, + "step": 4872, + "task_loss": 0.8830226063728333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5245742797851562, + "epoch": 4.12, + "learning_rate": 2.940405748098056e-05, + "loss": 0.3909, + "step": 4873, + "task_loss": 0.4560994505882263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4908671975135803, + "epoch": 4.12, + "learning_rate": 2.9399830938292478e-05, + "loss": 0.4516, + "step": 4874, + "task_loss": 0.7425429224967957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5652753114700317, + "epoch": 4.12, + "learning_rate": 2.9395604395604398e-05, + "loss": 0.6313, + "step": 4875, + "task_loss": 0.33220943808555603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48626336455345154, + "epoch": 4.12, + "learning_rate": 2.9391377852916318e-05, + "loss": 0.442, + "step": 4876, + "task_loss": 0.45793113112449646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7293793559074402, + "epoch": 4.12, + "learning_rate": 2.9387151310228234e-05, + "loss": 0.5456, + "step": 4877, + "task_loss": 1.1115078926086426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5300695896148682, + "epoch": 4.12, + "learning_rate": 2.9382924767540154e-05, + "loss": 0.5893, + "step": 4878, + "task_loss": 0.3955920934677124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35069793462753296, + "epoch": 4.12, + "learning_rate": 2.9378698224852074e-05, + "loss": 0.542, + "step": 4879, + "task_loss": 0.39194342494010925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.311199814081192, + "epoch": 4.13, + "learning_rate": 2.937447168216399e-05, + "loss": 0.5636, + "step": 4880, + "task_loss": 0.1707916259765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31814074516296387, + "epoch": 4.13, + "learning_rate": 2.937024513947591e-05, + "loss": 0.4404, + "step": 4881, + "task_loss": 0.18268778920173645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6300954222679138, + "epoch": 4.13, + "learning_rate": 2.936601859678783e-05, + "loss": 0.6501, + "step": 4882, + "task_loss": 1.3882330656051636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.305836945772171, + "epoch": 4.13, + "learning_rate": 2.9361792054099746e-05, + "loss": 0.5197, + "step": 4883, + "task_loss": 1.227432131767273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6732820272445679, + "epoch": 4.13, + "learning_rate": 2.9357565511411666e-05, + "loss": 0.583, + "step": 4884, + "task_loss": 0.816556990146637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5725735425949097, + "epoch": 4.13, + "learning_rate": 2.935333896872359e-05, + "loss": 0.455, + "step": 4885, + "task_loss": 0.1542321741580963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5702890157699585, + "epoch": 4.13, + "learning_rate": 2.93491124260355e-05, + "loss": 0.5776, + "step": 4886, + "task_loss": 0.2596447765827179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33251917362213135, + "epoch": 4.13, + "learning_rate": 2.934488588334742e-05, + "loss": 0.3957, + "step": 4887, + "task_loss": 0.1093897745013237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4518659710884094, + "epoch": 4.13, + "learning_rate": 2.9340659340659344e-05, + "loss": 0.5479, + "step": 4888, + "task_loss": 0.7237752079963684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36843740940093994, + "epoch": 4.13, + "learning_rate": 2.9336432797971257e-05, + "loss": 0.4508, + "step": 4889, + "task_loss": 0.15715515613555908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4993230104446411, + "epoch": 4.13, + "learning_rate": 2.933220625528318e-05, + "loss": 0.6884, + "step": 4890, + "task_loss": 1.0155936479568481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4642319083213806, + "epoch": 4.13, + "learning_rate": 2.93279797125951e-05, + "loss": 0.6264, + "step": 4891, + "task_loss": 0.5515473484992981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5643899440765381, + "epoch": 4.14, + "learning_rate": 2.932375316990702e-05, + "loss": 0.5864, + "step": 4892, + "task_loss": 0.9507386684417725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5338196754455566, + "epoch": 4.14, + "learning_rate": 2.9319526627218936e-05, + "loss": 0.7019, + "step": 4893, + "task_loss": 1.1920231580734253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6536885499954224, + "epoch": 4.14, + "learning_rate": 2.9315300084530856e-05, + "loss": 0.6259, + "step": 4894, + "task_loss": 0.6254956722259521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5598927736282349, + "epoch": 4.14, + "learning_rate": 2.9311073541842776e-05, + "loss": 0.5207, + "step": 4895, + "task_loss": 0.8743888139724731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8147443532943726, + "epoch": 4.14, + "learning_rate": 2.9306846999154692e-05, + "loss": 0.4724, + "step": 4896, + "task_loss": 0.9306342005729675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4392967224121094, + "epoch": 4.14, + "learning_rate": 2.9302620456466612e-05, + "loss": 0.5611, + "step": 4897, + "task_loss": 1.0989532470703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.672174334526062, + "epoch": 4.14, + "learning_rate": 2.929839391377853e-05, + "loss": 0.7541, + "step": 4898, + "task_loss": 1.0243942737579346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3433629274368286, + "epoch": 4.14, + "learning_rate": 2.9294167371090448e-05, + "loss": 0.4786, + "step": 4899, + "task_loss": 0.07585060596466064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5659266710281372, + "epoch": 4.14, + "learning_rate": 2.9289940828402368e-05, + "loss": 0.5755, + "step": 4900, + "task_loss": 0.6905208230018616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6160291433334351, + "epoch": 4.14, + "learning_rate": 2.9285714285714288e-05, + "loss": 0.5173, + "step": 4901, + "task_loss": 0.7377527952194214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3513527512550354, + "epoch": 4.14, + "learning_rate": 2.9281487743026204e-05, + "loss": 0.4925, + "step": 4902, + "task_loss": 0.9227238297462463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3689699172973633, + "epoch": 4.14, + "learning_rate": 2.9277261200338124e-05, + "loss": 0.5694, + "step": 4903, + "task_loss": 0.502204954624176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3335379958152771, + "epoch": 4.15, + "learning_rate": 2.9273034657650043e-05, + "loss": 0.396, + "step": 4904, + "task_loss": 0.2030128389596939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3724108040332794, + "epoch": 4.15, + "learning_rate": 2.9268808114961966e-05, + "loss": 0.3658, + "step": 4905, + "task_loss": 0.30825191736221313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6078504323959351, + "epoch": 4.15, + "learning_rate": 2.926458157227388e-05, + "loss": 0.8306, + "step": 4906, + "task_loss": 0.8077676296234131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4799591898918152, + "epoch": 4.15, + "learning_rate": 2.9260355029585803e-05, + "loss": 0.5028, + "step": 4907, + "task_loss": 0.646205484867096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6094024181365967, + "epoch": 4.15, + "learning_rate": 2.9256128486897722e-05, + "loss": 0.6137, + "step": 4908, + "task_loss": 0.3010289669036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5203437209129333, + "epoch": 4.15, + "learning_rate": 2.9251901944209635e-05, + "loss": 0.6958, + "step": 4909, + "task_loss": 0.4879305362701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5272976160049438, + "epoch": 4.15, + "learning_rate": 2.924767540152156e-05, + "loss": 0.5658, + "step": 4910, + "task_loss": 0.7172080278396606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.604246199131012, + "epoch": 4.15, + "learning_rate": 2.9243448858833478e-05, + "loss": 0.544, + "step": 4911, + "task_loss": 0.9679908752441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.368024080991745, + "epoch": 4.15, + "learning_rate": 2.923922231614539e-05, + "loss": 0.464, + "step": 4912, + "task_loss": 0.7688194513320923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9912531971931458, + "epoch": 4.15, + "learning_rate": 2.9234995773457314e-05, + "loss": 0.7213, + "step": 4913, + "task_loss": 0.8605411052703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7077438831329346, + "epoch": 4.15, + "learning_rate": 2.9230769230769234e-05, + "loss": 0.6176, + "step": 4914, + "task_loss": 0.9759804606437683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.266655296087265, + "epoch": 4.15, + "learning_rate": 2.922654268808115e-05, + "loss": 0.3885, + "step": 4915, + "task_loss": 0.04503697156906128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5361075401306152, + "epoch": 4.16, + "learning_rate": 2.922231614539307e-05, + "loss": 0.5723, + "step": 4916, + "task_loss": 0.4291464388370514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5263949036598206, + "epoch": 4.16, + "learning_rate": 2.921808960270499e-05, + "loss": 0.5451, + "step": 4917, + "task_loss": 0.5300502181053162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44350963830947876, + "epoch": 4.16, + "learning_rate": 2.9213863060016906e-05, + "loss": 0.605, + "step": 4918, + "task_loss": 0.7421781420707703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5925217866897583, + "epoch": 4.16, + "learning_rate": 2.9209636517328826e-05, + "loss": 0.6239, + "step": 4919, + "task_loss": 0.6273420453071594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46450141072273254, + "epoch": 4.16, + "learning_rate": 2.9205409974640746e-05, + "loss": 0.5079, + "step": 4920, + "task_loss": 0.5449296832084656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5474029183387756, + "epoch": 4.16, + "learning_rate": 2.9201183431952665e-05, + "loss": 0.6028, + "step": 4921, + "task_loss": 0.6845722198486328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3770174980163574, + "epoch": 4.16, + "learning_rate": 2.919695688926458e-05, + "loss": 0.5507, + "step": 4922, + "task_loss": 0.7765823602676392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4429907500743866, + "epoch": 4.16, + "learning_rate": 2.91927303465765e-05, + "loss": 0.5638, + "step": 4923, + "task_loss": 0.6939391493797302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44498199224472046, + "epoch": 4.16, + "learning_rate": 2.9188503803888425e-05, + "loss": 0.4144, + "step": 4924, + "task_loss": 0.47263678908348083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.643986165523529, + "epoch": 4.16, + "learning_rate": 2.9184277261200338e-05, + "loss": 0.4883, + "step": 4925, + "task_loss": 0.41211098432540894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29165083169937134, + "epoch": 4.16, + "learning_rate": 2.9180050718512257e-05, + "loss": 0.3935, + "step": 4926, + "task_loss": 0.3426480293273926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29551804065704346, + "epoch": 4.16, + "learning_rate": 2.917582417582418e-05, + "loss": 0.5643, + "step": 4927, + "task_loss": 0.3788049817085266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39327535033226013, + "epoch": 4.17, + "learning_rate": 2.9171597633136093e-05, + "loss": 0.4624, + "step": 4928, + "task_loss": 0.1327672153711319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.406957745552063, + "epoch": 4.17, + "learning_rate": 2.9167371090448013e-05, + "loss": 0.4637, + "step": 4929, + "task_loss": 0.651806652545929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5947275161743164, + "epoch": 4.17, + "learning_rate": 2.9163144547759936e-05, + "loss": 0.6036, + "step": 4930, + "task_loss": 0.6267604231834412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3165002763271332, + "epoch": 4.17, + "learning_rate": 2.915891800507185e-05, + "loss": 0.4925, + "step": 4931, + "task_loss": 1.1059143543243408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6140713691711426, + "epoch": 4.17, + "learning_rate": 2.9154691462383772e-05, + "loss": 0.6231, + "step": 4932, + "task_loss": 0.8499069213867188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3954935669898987, + "epoch": 4.17, + "learning_rate": 2.9150464919695692e-05, + "loss": 0.6254, + "step": 4933, + "task_loss": 0.9685243368148804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23880460858345032, + "epoch": 4.17, + "learning_rate": 2.9146238377007605e-05, + "loss": 0.716, + "step": 4934, + "task_loss": 0.049141138792037964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.657178521156311, + "epoch": 4.17, + "learning_rate": 2.9142011834319528e-05, + "loss": 0.5471, + "step": 4935, + "task_loss": 0.7862703800201416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16087676584720612, + "epoch": 4.17, + "learning_rate": 2.9137785291631448e-05, + "loss": 0.4434, + "step": 4936, + "task_loss": 0.021729743108153343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5415525436401367, + "epoch": 4.17, + "learning_rate": 2.9133558748943368e-05, + "loss": 0.5249, + "step": 4937, + "task_loss": 1.0101770162582397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37750259041786194, + "epoch": 4.17, + "learning_rate": 2.9129332206255284e-05, + "loss": 0.5025, + "step": 4938, + "task_loss": 0.8420785069465637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4695941209793091, + "epoch": 4.17, + "learning_rate": 2.9125105663567204e-05, + "loss": 0.6695, + "step": 4939, + "task_loss": 0.8085657358169556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.481032133102417, + "epoch": 4.18, + "learning_rate": 2.9120879120879123e-05, + "loss": 0.5884, + "step": 4940, + "task_loss": 1.667453646659851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5970499515533447, + "epoch": 4.18, + "learning_rate": 2.911665257819104e-05, + "loss": 0.4295, + "step": 4941, + "task_loss": 0.460021048784256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35698890686035156, + "epoch": 4.18, + "learning_rate": 2.911242603550296e-05, + "loss": 0.4166, + "step": 4942, + "task_loss": 0.7841063141822815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38521382212638855, + "epoch": 4.18, + "learning_rate": 2.910819949281488e-05, + "loss": 0.4549, + "step": 4943, + "task_loss": 0.7064574956893921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38757622241973877, + "epoch": 4.18, + "learning_rate": 2.9103972950126796e-05, + "loss": 0.5376, + "step": 4944, + "task_loss": 0.41958001255989075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5023869872093201, + "epoch": 4.18, + "learning_rate": 2.9099746407438715e-05, + "loss": 0.4174, + "step": 4945, + "task_loss": 0.9790821075439453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5780279040336609, + "epoch": 4.18, + "learning_rate": 2.9095519864750635e-05, + "loss": 0.5833, + "step": 4946, + "task_loss": 0.16638527810573578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5554252862930298, + "epoch": 4.18, + "learning_rate": 2.909129332206255e-05, + "loss": 0.5079, + "step": 4947, + "task_loss": 0.5973990559577942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5831525921821594, + "epoch": 4.18, + "learning_rate": 2.908706677937447e-05, + "loss": 0.4806, + "step": 4948, + "task_loss": 0.5166064500808716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6369892954826355, + "epoch": 4.18, + "learning_rate": 2.9082840236686394e-05, + "loss": 0.554, + "step": 4949, + "task_loss": 1.5615217685699463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.55471271276474, + "epoch": 4.18, + "learning_rate": 2.9078613693998314e-05, + "loss": 0.4924, + "step": 4950, + "task_loss": 0.8947004079818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29818692803382874, + "epoch": 4.19, + "learning_rate": 2.9074387151310227e-05, + "loss": 0.4874, + "step": 4951, + "task_loss": 0.04880141094326973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5417750477790833, + "epoch": 4.19, + "learning_rate": 2.907016060862215e-05, + "loss": 0.444, + "step": 4952, + "task_loss": 0.8822089433670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.523271381855011, + "epoch": 4.19, + "learning_rate": 2.906593406593407e-05, + "loss": 0.5113, + "step": 4953, + "task_loss": 1.0857772827148438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5274238586425781, + "epoch": 4.19, + "learning_rate": 2.9061707523245986e-05, + "loss": 0.5191, + "step": 4954, + "task_loss": 0.7283214330673218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7125116586685181, + "epoch": 4.19, + "learning_rate": 2.9057480980557906e-05, + "loss": 0.61, + "step": 4955, + "task_loss": 0.6378865838050842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2907133102416992, + "epoch": 4.19, + "learning_rate": 2.9053254437869826e-05, + "loss": 0.6648, + "step": 4956, + "task_loss": 0.9520096778869629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6296253204345703, + "epoch": 4.19, + "learning_rate": 2.9049027895181742e-05, + "loss": 0.5065, + "step": 4957, + "task_loss": 0.3367064595222473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3917253613471985, + "epoch": 4.19, + "learning_rate": 2.9044801352493662e-05, + "loss": 0.548, + "step": 4958, + "task_loss": 0.5287610292434692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5130088925361633, + "epoch": 4.19, + "learning_rate": 2.904057480980558e-05, + "loss": 0.5093, + "step": 4959, + "task_loss": 0.29144880175590515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6891018152236938, + "epoch": 4.19, + "learning_rate": 2.9036348267117498e-05, + "loss": 0.4839, + "step": 4960, + "task_loss": 0.2483626902103424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23162618279457092, + "epoch": 4.19, + "learning_rate": 2.9032121724429418e-05, + "loss": 0.4695, + "step": 4961, + "task_loss": 1.7705329656600952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5970432758331299, + "epoch": 4.19, + "learning_rate": 2.9027895181741337e-05, + "loss": 0.609, + "step": 4962, + "task_loss": 0.8749106526374817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4535556137561798, + "epoch": 4.2, + "learning_rate": 2.9023668639053254e-05, + "loss": 0.554, + "step": 4963, + "task_loss": 0.5557235479354858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28293246030807495, + "epoch": 4.2, + "learning_rate": 2.9019442096365173e-05, + "loss": 0.5694, + "step": 4964, + "task_loss": 0.5637669563293457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5207484364509583, + "epoch": 4.2, + "learning_rate": 2.9015215553677093e-05, + "loss": 0.5387, + "step": 4965, + "task_loss": 0.42689841985702515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3967847228050232, + "epoch": 4.2, + "learning_rate": 2.9010989010989016e-05, + "loss": 0.5142, + "step": 4966, + "task_loss": 0.8248480558395386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8131296634674072, + "epoch": 4.2, + "learning_rate": 2.900676246830093e-05, + "loss": 0.6822, + "step": 4967, + "task_loss": 0.9920462369918823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23411710560321808, + "epoch": 4.2, + "learning_rate": 2.900253592561285e-05, + "loss": 0.5174, + "step": 4968, + "task_loss": 0.4750393331050873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47619348764419556, + "epoch": 4.2, + "learning_rate": 2.8998309382924772e-05, + "loss": 0.5657, + "step": 4969, + "task_loss": 0.44457554817199707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0993400812149048, + "epoch": 4.2, + "learning_rate": 2.8994082840236685e-05, + "loss": 0.8597, + "step": 4970, + "task_loss": 1.513477087020874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5288493633270264, + "epoch": 4.2, + "learning_rate": 2.8989856297548608e-05, + "loss": 0.4473, + "step": 4971, + "task_loss": 0.5993368029594421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5062902569770813, + "epoch": 4.2, + "learning_rate": 2.8985629754860528e-05, + "loss": 0.5725, + "step": 4972, + "task_loss": 0.8104445934295654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7969005107879639, + "epoch": 4.2, + "learning_rate": 2.898140321217244e-05, + "loss": 0.6367, + "step": 4973, + "task_loss": 0.9035048484802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7410600185394287, + "epoch": 4.2, + "learning_rate": 2.8977176669484364e-05, + "loss": 0.5603, + "step": 4974, + "task_loss": 0.8375561237335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3379054069519043, + "epoch": 4.21, + "learning_rate": 2.8972950126796284e-05, + "loss": 0.4879, + "step": 4975, + "task_loss": 0.5272780656814575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38761106133461, + "epoch": 4.21, + "learning_rate": 2.8968723584108197e-05, + "loss": 0.5699, + "step": 4976, + "task_loss": 0.515591561794281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8695029020309448, + "epoch": 4.21, + "learning_rate": 2.896449704142012e-05, + "loss": 0.6183, + "step": 4977, + "task_loss": 1.400099515914917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4410107731819153, + "epoch": 4.21, + "learning_rate": 2.896027049873204e-05, + "loss": 0.5514, + "step": 4978, + "task_loss": 0.16848812997341156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5139065384864807, + "epoch": 4.21, + "learning_rate": 2.895604395604396e-05, + "loss": 0.4932, + "step": 4979, + "task_loss": 0.7612088322639465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3685995936393738, + "epoch": 4.21, + "learning_rate": 2.8951817413355876e-05, + "loss": 0.5016, + "step": 4980, + "task_loss": 0.49263593554496765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40824630856513977, + "epoch": 4.21, + "learning_rate": 2.8947590870667795e-05, + "loss": 0.7652, + "step": 4981, + "task_loss": 0.26346105337142944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4100915789604187, + "epoch": 4.21, + "learning_rate": 2.8943364327979715e-05, + "loss": 0.3495, + "step": 4982, + "task_loss": 1.1683602333068848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5771180391311646, + "epoch": 4.21, + "learning_rate": 2.893913778529163e-05, + "loss": 0.5585, + "step": 4983, + "task_loss": 0.6446143388748169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0801568031311035, + "epoch": 4.21, + "learning_rate": 2.893491124260355e-05, + "loss": 0.8557, + "step": 4984, + "task_loss": 1.6549850702285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6279933452606201, + "epoch": 4.21, + "learning_rate": 2.893068469991547e-05, + "loss": 0.5336, + "step": 4985, + "task_loss": 0.9761338233947754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6012387275695801, + "epoch": 4.21, + "learning_rate": 2.8926458157227387e-05, + "loss": 0.7233, + "step": 4986, + "task_loss": 0.8339069485664368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4568895995616913, + "epoch": 4.22, + "learning_rate": 2.8922231614539307e-05, + "loss": 0.5408, + "step": 4987, + "task_loss": 1.250498652458191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.874041736125946, + "epoch": 4.22, + "learning_rate": 2.891800507185123e-05, + "loss": 0.5565, + "step": 4988, + "task_loss": 1.4179728031158447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49423927068710327, + "epoch": 4.22, + "learning_rate": 2.8913778529163143e-05, + "loss": 0.6198, + "step": 4989, + "task_loss": 0.7868390679359436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43859049677848816, + "epoch": 4.22, + "learning_rate": 2.8909551986475063e-05, + "loss": 0.5441, + "step": 4990, + "task_loss": 0.7789090871810913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2009529322385788, + "epoch": 4.22, + "learning_rate": 2.8905325443786986e-05, + "loss": 0.4136, + "step": 4991, + "task_loss": 0.06924349069595337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5552215576171875, + "epoch": 4.22, + "learning_rate": 2.89010989010989e-05, + "loss": 0.5511, + "step": 4992, + "task_loss": 0.6917145848274231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27759259939193726, + "epoch": 4.22, + "learning_rate": 2.889687235841082e-05, + "loss": 0.3879, + "step": 4993, + "task_loss": 0.2607293128967285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8054690957069397, + "epoch": 4.22, + "learning_rate": 2.8892645815722742e-05, + "loss": 0.503, + "step": 4994, + "task_loss": 1.4566730260849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48486143350601196, + "epoch": 4.22, + "learning_rate": 2.888841927303466e-05, + "loss": 0.5523, + "step": 4995, + "task_loss": 0.975770890712738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27125826478004456, + "epoch": 4.22, + "learning_rate": 2.8884192730346578e-05, + "loss": 0.553, + "step": 4996, + "task_loss": 0.5320665836334229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2912842333316803, + "epoch": 4.22, + "learning_rate": 2.8879966187658498e-05, + "loss": 0.5848, + "step": 4997, + "task_loss": 0.5996488332748413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3529224395751953, + "epoch": 4.22, + "learning_rate": 2.8875739644970417e-05, + "loss": 0.5161, + "step": 4998, + "task_loss": 0.22224129736423492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9461209774017334, + "epoch": 4.23, + "learning_rate": 2.8871513102282334e-05, + "loss": 0.5461, + "step": 4999, + "task_loss": 0.9298988580703735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.363048255443573, + "epoch": 4.23, + "learning_rate": 2.8867286559594254e-05, + "loss": 0.4928, + "step": 5000, + "task_loss": 0.7944290637969971 + }, + { + "epoch": 4.23, + "eval_accuracy": 0.9035643564356436, + "eval_loss": 0.3522135317325592, + "eval_runtime": 229.0744, + "eval_samples_per_second": 110.226, + "eval_steps_per_second": 0.864, + "step": 5000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5424635410308838, + "epoch": 4.23, + "learning_rate": 2.8863060016906173e-05, + "loss": 0.5581, + "step": 5001, + "task_loss": 0.7681723237037659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5082840323448181, + "epoch": 4.23, + "learning_rate": 2.885883347421809e-05, + "loss": 0.6123, + "step": 5002, + "task_loss": 0.31173062324523926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3281029462814331, + "epoch": 4.23, + "learning_rate": 2.885460693153001e-05, + "loss": 0.5049, + "step": 5003, + "task_loss": 0.37610554695129395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5271156430244446, + "epoch": 4.23, + "learning_rate": 2.885038038884193e-05, + "loss": 0.5752, + "step": 5004, + "task_loss": 0.5435252785682678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3205392062664032, + "epoch": 4.23, + "learning_rate": 2.8846153846153845e-05, + "loss": 0.4256, + "step": 5005, + "task_loss": 0.34773632884025574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.416623055934906, + "epoch": 4.23, + "learning_rate": 2.8841927303465765e-05, + "loss": 0.6099, + "step": 5006, + "task_loss": 0.2906308174133301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45569419860839844, + "epoch": 4.23, + "learning_rate": 2.8837700760777685e-05, + "loss": 0.4321, + "step": 5007, + "task_loss": 0.5803009867668152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37213602662086487, + "epoch": 4.23, + "learning_rate": 2.8833474218089608e-05, + "loss": 0.4653, + "step": 5008, + "task_loss": 0.07518910616636276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5695041418075562, + "epoch": 4.23, + "learning_rate": 2.882924767540152e-05, + "loss": 0.487, + "step": 5009, + "task_loss": 0.404824435710907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6915467381477356, + "epoch": 4.23, + "learning_rate": 2.882502113271344e-05, + "loss": 0.6883, + "step": 5010, + "task_loss": 0.7848670482635498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39997023344039917, + "epoch": 4.24, + "learning_rate": 2.8820794590025364e-05, + "loss": 0.4293, + "step": 5011, + "task_loss": 0.4946521818637848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4524044096469879, + "epoch": 4.24, + "learning_rate": 2.8816568047337277e-05, + "loss": 0.4829, + "step": 5012, + "task_loss": 0.7310373187065125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5082722902297974, + "epoch": 4.24, + "learning_rate": 2.88123415046492e-05, + "loss": 0.4349, + "step": 5013, + "task_loss": 0.5728713274002075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5411403179168701, + "epoch": 4.24, + "learning_rate": 2.880811496196112e-05, + "loss": 0.64, + "step": 5014, + "task_loss": 0.595592200756073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5737406611442566, + "epoch": 4.24, + "learning_rate": 2.8803888419273033e-05, + "loss": 0.5593, + "step": 5015, + "task_loss": 0.3804052472114563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3536869287490845, + "epoch": 4.24, + "learning_rate": 2.8799661876584956e-05, + "loss": 0.5477, + "step": 5016, + "task_loss": 0.5726757049560547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35529932379722595, + "epoch": 4.24, + "learning_rate": 2.8795435333896876e-05, + "loss": 0.4997, + "step": 5017, + "task_loss": 0.9764136075973511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7777359485626221, + "epoch": 4.24, + "learning_rate": 2.8791208791208792e-05, + "loss": 0.5859, + "step": 5018, + "task_loss": 0.6851890087127686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.544541597366333, + "epoch": 4.24, + "learning_rate": 2.878698224852071e-05, + "loss": 0.536, + "step": 5019, + "task_loss": 0.9854859709739685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5944560766220093, + "epoch": 4.24, + "learning_rate": 2.878275570583263e-05, + "loss": 0.4635, + "step": 5020, + "task_loss": 0.2814624011516571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35691067576408386, + "epoch": 4.24, + "learning_rate": 2.8778529163144548e-05, + "loss": 0.4022, + "step": 5021, + "task_loss": 0.2276492565870285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33211642503738403, + "epoch": 4.24, + "learning_rate": 2.8774302620456467e-05, + "loss": 0.4664, + "step": 5022, + "task_loss": 1.2398892641067505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5359298586845398, + "epoch": 4.25, + "learning_rate": 2.8770076077768387e-05, + "loss": 0.3557, + "step": 5023, + "task_loss": 0.5677745342254639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.598246693611145, + "epoch": 4.25, + "learning_rate": 2.8765849535080307e-05, + "loss": 0.6645, + "step": 5024, + "task_loss": 0.45430052280426025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5481970906257629, + "epoch": 4.25, + "learning_rate": 2.8761622992392223e-05, + "loss": 0.6308, + "step": 5025, + "task_loss": 1.0280213356018066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5252529382705688, + "epoch": 4.25, + "learning_rate": 2.8757396449704143e-05, + "loss": 0.5848, + "step": 5026, + "task_loss": 0.3855023980140686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4001982808113098, + "epoch": 4.25, + "learning_rate": 2.8753169907016063e-05, + "loss": 0.5481, + "step": 5027, + "task_loss": 0.2155093550682068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5002865195274353, + "epoch": 4.25, + "learning_rate": 2.874894336432798e-05, + "loss": 0.5, + "step": 5028, + "task_loss": 0.6530987620353699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7264885902404785, + "epoch": 4.25, + "learning_rate": 2.87447168216399e-05, + "loss": 0.5128, + "step": 5029, + "task_loss": 1.5708767175674438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7406023740768433, + "epoch": 4.25, + "learning_rate": 2.8740490278951822e-05, + "loss": 0.5352, + "step": 5030, + "task_loss": 0.8087359070777893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5345301628112793, + "epoch": 4.25, + "learning_rate": 2.8736263736263735e-05, + "loss": 0.5976, + "step": 5031, + "task_loss": 1.0272800922393799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4186236262321472, + "epoch": 4.25, + "learning_rate": 2.8732037193575655e-05, + "loss": 0.5201, + "step": 5032, + "task_loss": 0.10075681656599045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5343166589736938, + "epoch": 4.25, + "learning_rate": 2.8727810650887578e-05, + "loss": 0.4477, + "step": 5033, + "task_loss": 0.7180424928665161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6384636163711548, + "epoch": 4.26, + "learning_rate": 2.872358410819949e-05, + "loss": 0.5919, + "step": 5034, + "task_loss": 0.8868371844291687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4453398287296295, + "epoch": 4.26, + "learning_rate": 2.8719357565511414e-05, + "loss": 0.541, + "step": 5035, + "task_loss": 0.7494087219238281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7095698118209839, + "epoch": 4.26, + "learning_rate": 2.8715131022823334e-05, + "loss": 0.4759, + "step": 5036, + "task_loss": 1.0688354969024658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9623632431030273, + "epoch": 4.26, + "learning_rate": 2.8710904480135253e-05, + "loss": 0.8396, + "step": 5037, + "task_loss": 0.34394949674606323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6313179135322571, + "epoch": 4.26, + "learning_rate": 2.870667793744717e-05, + "loss": 0.5707, + "step": 5038, + "task_loss": 0.4269314408302307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6322230100631714, + "epoch": 4.26, + "learning_rate": 2.870245139475909e-05, + "loss": 0.6423, + "step": 5039, + "task_loss": 0.7876618504524231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5772720575332642, + "epoch": 4.26, + "learning_rate": 2.869822485207101e-05, + "loss": 0.4847, + "step": 5040, + "task_loss": 0.2415463775396347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0083447694778442, + "epoch": 4.26, + "learning_rate": 2.8693998309382926e-05, + "loss": 0.523, + "step": 5041, + "task_loss": 0.7287866473197937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35244181752204895, + "epoch": 4.26, + "learning_rate": 2.8689771766694845e-05, + "loss": 0.4214, + "step": 5042, + "task_loss": 0.5554124116897583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2813867926597595, + "epoch": 4.26, + "learning_rate": 2.8685545224006765e-05, + "loss": 0.4834, + "step": 5043, + "task_loss": 0.31542858481407166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9355268478393555, + "epoch": 4.26, + "learning_rate": 2.868131868131868e-05, + "loss": 0.6104, + "step": 5044, + "task_loss": 0.2900858223438263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5138185620307922, + "epoch": 4.26, + "learning_rate": 2.86770921386306e-05, + "loss": 0.4551, + "step": 5045, + "task_loss": 0.3275415003299713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5506541132926941, + "epoch": 4.27, + "learning_rate": 2.867286559594252e-05, + "loss": 0.4276, + "step": 5046, + "task_loss": 0.8579879999160767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3394980728626251, + "epoch": 4.27, + "learning_rate": 2.8668639053254437e-05, + "loss": 0.6012, + "step": 5047, + "task_loss": 0.4949684739112854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6559955477714539, + "epoch": 4.27, + "learning_rate": 2.8664412510566357e-05, + "loss": 0.703, + "step": 5048, + "task_loss": 0.6371138691902161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7527486085891724, + "epoch": 4.27, + "learning_rate": 2.8660185967878277e-05, + "loss": 0.6325, + "step": 5049, + "task_loss": 0.6538408398628235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3338960111141205, + "epoch": 4.27, + "learning_rate": 2.8655959425190193e-05, + "loss": 0.3403, + "step": 5050, + "task_loss": 0.719497561454773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41151633858680725, + "epoch": 4.27, + "learning_rate": 2.8651732882502113e-05, + "loss": 0.611, + "step": 5051, + "task_loss": 0.6953022480010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49113377928733826, + "epoch": 4.27, + "learning_rate": 2.8647506339814033e-05, + "loss": 0.5068, + "step": 5052, + "task_loss": 0.5454524755477905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8189623355865479, + "epoch": 4.27, + "learning_rate": 2.8643279797125956e-05, + "loss": 0.6825, + "step": 5053, + "task_loss": 1.085884928703308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5768108367919922, + "epoch": 4.27, + "learning_rate": 2.863905325443787e-05, + "loss": 0.4658, + "step": 5054, + "task_loss": 1.254048228263855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6696983575820923, + "epoch": 4.27, + "learning_rate": 2.8634826711749792e-05, + "loss": 0.5947, + "step": 5055, + "task_loss": 0.454205185174942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9175926446914673, + "epoch": 4.27, + "learning_rate": 2.863060016906171e-05, + "loss": 0.5407, + "step": 5056, + "task_loss": 1.322234869003296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9288214445114136, + "epoch": 4.27, + "learning_rate": 2.8626373626373624e-05, + "loss": 0.5564, + "step": 5057, + "task_loss": 1.1883230209350586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.310880184173584, + "epoch": 4.28, + "learning_rate": 2.8622147083685548e-05, + "loss": 0.5495, + "step": 5058, + "task_loss": 0.13048161566257477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8108692169189453, + "epoch": 4.28, + "learning_rate": 2.8617920540997467e-05, + "loss": 0.7225, + "step": 5059, + "task_loss": 0.5182772874832153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0437372922897339, + "epoch": 4.28, + "learning_rate": 2.8613693998309384e-05, + "loss": 0.8408, + "step": 5060, + "task_loss": 1.1800932884216309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4520098567008972, + "epoch": 4.28, + "learning_rate": 2.8609467455621303e-05, + "loss": 0.581, + "step": 5061, + "task_loss": 0.5899325609207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29349690675735474, + "epoch": 4.28, + "learning_rate": 2.8605240912933223e-05, + "loss": 0.5258, + "step": 5062, + "task_loss": 0.9250083565711975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5980914831161499, + "epoch": 4.28, + "learning_rate": 2.860101437024514e-05, + "loss": 0.6895, + "step": 5063, + "task_loss": 0.852351188659668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43609872460365295, + "epoch": 4.28, + "learning_rate": 2.859678782755706e-05, + "loss": 0.6501, + "step": 5064, + "task_loss": 0.5329897403717041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.601274847984314, + "epoch": 4.28, + "learning_rate": 2.859256128486898e-05, + "loss": 0.5103, + "step": 5065, + "task_loss": 0.4264865517616272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.501763105392456, + "epoch": 4.28, + "learning_rate": 2.85883347421809e-05, + "loss": 0.4882, + "step": 5066, + "task_loss": 1.025362253189087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4265732169151306, + "epoch": 4.28, + "learning_rate": 2.8584108199492815e-05, + "loss": 0.5224, + "step": 5067, + "task_loss": 0.1147022619843483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48253557085990906, + "epoch": 4.28, + "learning_rate": 2.8579881656804735e-05, + "loss": 0.606, + "step": 5068, + "task_loss": 1.0819717645645142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8059843182563782, + "epoch": 4.28, + "learning_rate": 2.8575655114116655e-05, + "loss": 0.5613, + "step": 5069, + "task_loss": 0.7770708799362183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7644687294960022, + "epoch": 4.29, + "learning_rate": 2.857142857142857e-05, + "loss": 0.6374, + "step": 5070, + "task_loss": 1.2870728969573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39863288402557373, + "epoch": 4.29, + "learning_rate": 2.856720202874049e-05, + "loss": 0.439, + "step": 5071, + "task_loss": 0.08547637611627579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3423424959182739, + "epoch": 4.29, + "learning_rate": 2.8562975486052414e-05, + "loss": 0.4383, + "step": 5072, + "task_loss": 0.4734783172607422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.740888774394989, + "epoch": 4.29, + "learning_rate": 2.8558748943364327e-05, + "loss": 0.808, + "step": 5073, + "task_loss": 0.8702861666679382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8916035890579224, + "epoch": 4.29, + "learning_rate": 2.8554522400676246e-05, + "loss": 0.8385, + "step": 5074, + "task_loss": 0.7356163859367371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49436667561531067, + "epoch": 4.29, + "learning_rate": 2.855029585798817e-05, + "loss": 0.7151, + "step": 5075, + "task_loss": 0.9598007798194885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3796423673629761, + "epoch": 4.29, + "learning_rate": 2.8546069315300083e-05, + "loss": 0.5543, + "step": 5076, + "task_loss": 0.17260704934597015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6580232977867126, + "epoch": 4.29, + "learning_rate": 2.8541842772612006e-05, + "loss": 0.6424, + "step": 5077, + "task_loss": 0.4667375087738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4417484402656555, + "epoch": 4.29, + "learning_rate": 2.8537616229923925e-05, + "loss": 0.4634, + "step": 5078, + "task_loss": 0.511508047580719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7601735591888428, + "epoch": 4.29, + "learning_rate": 2.853338968723584e-05, + "loss": 0.5502, + "step": 5079, + "task_loss": 0.7513731122016907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3145574927330017, + "epoch": 4.29, + "learning_rate": 2.852916314454776e-05, + "loss": 0.4365, + "step": 5080, + "task_loss": 0.38452383875846863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5880226492881775, + "epoch": 4.29, + "learning_rate": 2.852493660185968e-05, + "loss": 0.4497, + "step": 5081, + "task_loss": 1.0433785915374756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4004688262939453, + "epoch": 4.3, + "learning_rate": 2.85207100591716e-05, + "loss": 0.5593, + "step": 5082, + "task_loss": 0.4971187710762024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43154376745224, + "epoch": 4.3, + "learning_rate": 2.8516483516483517e-05, + "loss": 0.5792, + "step": 5083, + "task_loss": 0.7321638464927673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38204774260520935, + "epoch": 4.3, + "learning_rate": 2.8512256973795437e-05, + "loss": 0.5214, + "step": 5084, + "task_loss": 0.6594197154045105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27718475461006165, + "epoch": 4.3, + "learning_rate": 2.8508030431107357e-05, + "loss": 0.4359, + "step": 5085, + "task_loss": 0.03347624093294144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.82254958152771, + "epoch": 4.3, + "learning_rate": 2.8503803888419273e-05, + "loss": 0.5682, + "step": 5086, + "task_loss": 0.59958416223526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6809030175209045, + "epoch": 4.3, + "learning_rate": 2.8499577345731193e-05, + "loss": 0.5768, + "step": 5087, + "task_loss": 1.0686261653900146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3533551096916199, + "epoch": 4.3, + "learning_rate": 2.8495350803043113e-05, + "loss": 0.4384, + "step": 5088, + "task_loss": 0.8124313950538635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6623342633247375, + "epoch": 4.3, + "learning_rate": 2.849112426035503e-05, + "loss": 0.5361, + "step": 5089, + "task_loss": 0.6109859943389893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3484307527542114, + "epoch": 4.3, + "learning_rate": 2.848689771766695e-05, + "loss": 0.4501, + "step": 5090, + "task_loss": 0.48259633779525757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43353691697120667, + "epoch": 4.3, + "learning_rate": 2.848267117497887e-05, + "loss": 0.5263, + "step": 5091, + "task_loss": 0.579473614692688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49614328145980835, + "epoch": 4.3, + "learning_rate": 2.8478444632290785e-05, + "loss": 0.4894, + "step": 5092, + "task_loss": 0.628524899482727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5502766370773315, + "epoch": 4.3, + "learning_rate": 2.8474218089602705e-05, + "loss": 0.5102, + "step": 5093, + "task_loss": 1.0286712646484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.662949800491333, + "epoch": 4.31, + "learning_rate": 2.8469991546914628e-05, + "loss": 0.5032, + "step": 5094, + "task_loss": 0.5948931574821472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3951289653778076, + "epoch": 4.31, + "learning_rate": 2.8465765004226547e-05, + "loss": 0.5088, + "step": 5095, + "task_loss": 0.4640941023826599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.357793390750885, + "epoch": 4.31, + "learning_rate": 2.846153846153846e-05, + "loss": 0.6069, + "step": 5096, + "task_loss": 1.0247195959091187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37486934661865234, + "epoch": 4.31, + "learning_rate": 2.8457311918850383e-05, + "loss": 0.3958, + "step": 5097, + "task_loss": 0.11131393909454346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5342576503753662, + "epoch": 4.31, + "learning_rate": 2.8453085376162303e-05, + "loss": 0.6418, + "step": 5098, + "task_loss": 0.7262894511222839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5514885187149048, + "epoch": 4.31, + "learning_rate": 2.844885883347422e-05, + "loss": 0.6562, + "step": 5099, + "task_loss": 1.1110492944717407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40893006324768066, + "epoch": 4.31, + "learning_rate": 2.844463229078614e-05, + "loss": 0.4178, + "step": 5100, + "task_loss": 0.717585563659668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5973784327507019, + "epoch": 4.31, + "learning_rate": 2.844040574809806e-05, + "loss": 0.572, + "step": 5101, + "task_loss": 1.197821021080017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6897644996643066, + "epoch": 4.31, + "learning_rate": 2.8436179205409975e-05, + "loss": 0.4778, + "step": 5102, + "task_loss": 0.3001370131969452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5698708295822144, + "epoch": 4.31, + "learning_rate": 2.8431952662721895e-05, + "loss": 0.6569, + "step": 5103, + "task_loss": 1.7258988618850708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3880610764026642, + "epoch": 4.31, + "learning_rate": 2.8427726120033815e-05, + "loss": 0.5722, + "step": 5104, + "task_loss": 0.6297459006309509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4154919981956482, + "epoch": 4.32, + "learning_rate": 2.842349957734573e-05, + "loss": 0.4683, + "step": 5105, + "task_loss": 0.7200974822044373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6565115451812744, + "epoch": 4.32, + "learning_rate": 2.841927303465765e-05, + "loss": 0.4753, + "step": 5106, + "task_loss": 1.7250747680664062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5814658999443054, + "epoch": 4.32, + "learning_rate": 2.841504649196957e-05, + "loss": 0.5159, + "step": 5107, + "task_loss": 0.7972137928009033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27703380584716797, + "epoch": 4.32, + "learning_rate": 2.8410819949281487e-05, + "loss": 0.4695, + "step": 5108, + "task_loss": 0.8439851999282837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6899734139442444, + "epoch": 4.32, + "learning_rate": 2.8406593406593407e-05, + "loss": 0.5645, + "step": 5109, + "task_loss": 0.6597148776054382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0467429161071777, + "epoch": 4.32, + "learning_rate": 2.8402366863905327e-05, + "loss": 0.6363, + "step": 5110, + "task_loss": 0.989573061466217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5966413021087646, + "epoch": 4.32, + "learning_rate": 2.839814032121725e-05, + "loss": 0.5452, + "step": 5111, + "task_loss": 1.4620239734649658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8440982103347778, + "epoch": 4.32, + "learning_rate": 2.8393913778529163e-05, + "loss": 0.5259, + "step": 5112, + "task_loss": 0.7884932160377502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6587424278259277, + "epoch": 4.32, + "learning_rate": 2.8389687235841082e-05, + "loss": 0.542, + "step": 5113, + "task_loss": 1.0504063367843628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8028830885887146, + "epoch": 4.32, + "learning_rate": 2.8385460693153005e-05, + "loss": 0.5945, + "step": 5114, + "task_loss": 0.6810760498046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.584349513053894, + "epoch": 4.32, + "learning_rate": 2.838123415046492e-05, + "loss": 0.5065, + "step": 5115, + "task_loss": 0.28613045811653137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33873969316482544, + "epoch": 4.32, + "learning_rate": 2.8377007607776838e-05, + "loss": 0.5025, + "step": 5116, + "task_loss": 0.44696739315986633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3172038197517395, + "epoch": 4.33, + "learning_rate": 2.837278106508876e-05, + "loss": 0.3665, + "step": 5117, + "task_loss": 0.30419179797172546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3495364785194397, + "epoch": 4.33, + "learning_rate": 2.8368554522400674e-05, + "loss": 0.4374, + "step": 5118, + "task_loss": 0.423458993434906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7016594409942627, + "epoch": 4.33, + "learning_rate": 2.8364327979712597e-05, + "loss": 0.6444, + "step": 5119, + "task_loss": 0.25258922576904297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5477955341339111, + "epoch": 4.33, + "learning_rate": 2.8360101437024517e-05, + "loss": 0.5276, + "step": 5120, + "task_loss": 0.6664541959762573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44633913040161133, + "epoch": 4.33, + "learning_rate": 2.835587489433643e-05, + "loss": 0.4672, + "step": 5121, + "task_loss": 0.6767635941505432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6306418776512146, + "epoch": 4.33, + "learning_rate": 2.8351648351648353e-05, + "loss": 0.5734, + "step": 5122, + "task_loss": 1.5109035968780518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8460204601287842, + "epoch": 4.33, + "learning_rate": 2.8347421808960273e-05, + "loss": 0.5884, + "step": 5123, + "task_loss": 0.9197711944580078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5007159113883972, + "epoch": 4.33, + "learning_rate": 2.8343195266272193e-05, + "loss": 0.5058, + "step": 5124, + "task_loss": 0.49399128556251526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6091967821121216, + "epoch": 4.33, + "learning_rate": 2.833896872358411e-05, + "loss": 0.4897, + "step": 5125, + "task_loss": 0.7462937235832214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42689502239227295, + "epoch": 4.33, + "learning_rate": 2.833474218089603e-05, + "loss": 0.47, + "step": 5126, + "task_loss": 1.3030518293380737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4012297987937927, + "epoch": 4.33, + "learning_rate": 2.833051563820795e-05, + "loss": 0.6271, + "step": 5127, + "task_loss": 0.8749964833259583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5435868501663208, + "epoch": 4.33, + "learning_rate": 2.8326289095519865e-05, + "loss": 0.5572, + "step": 5128, + "task_loss": 1.041831135749817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1026819944381714, + "epoch": 4.34, + "learning_rate": 2.8322062552831785e-05, + "loss": 0.7004, + "step": 5129, + "task_loss": 0.6742920875549316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5178220272064209, + "epoch": 4.34, + "learning_rate": 2.8317836010143704e-05, + "loss": 0.5686, + "step": 5130, + "task_loss": 0.737801730632782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2614779472351074, + "epoch": 4.34, + "learning_rate": 2.831360946745562e-05, + "loss": 0.454, + "step": 5131, + "task_loss": 0.785963237285614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6100143194198608, + "epoch": 4.34, + "learning_rate": 2.830938292476754e-05, + "loss": 0.5511, + "step": 5132, + "task_loss": 1.0341075658798218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25425082445144653, + "epoch": 4.34, + "learning_rate": 2.830515638207946e-05, + "loss": 0.4488, + "step": 5133, + "task_loss": 0.658420979976654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48537927865982056, + "epoch": 4.34, + "learning_rate": 2.8300929839391377e-05, + "loss": 0.6504, + "step": 5134, + "task_loss": 0.6704983115196228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37334245443344116, + "epoch": 4.34, + "learning_rate": 2.8296703296703296e-05, + "loss": 0.5203, + "step": 5135, + "task_loss": 0.3471969664096832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.300645649433136, + "epoch": 4.34, + "learning_rate": 2.829247675401522e-05, + "loss": 0.3204, + "step": 5136, + "task_loss": 0.061320386826992035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3374484181404114, + "epoch": 4.34, + "learning_rate": 2.8288250211327132e-05, + "loss": 0.4713, + "step": 5137, + "task_loss": 0.4944455027580261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5325794816017151, + "epoch": 4.34, + "learning_rate": 2.8284023668639052e-05, + "loss": 0.506, + "step": 5138, + "task_loss": 0.6164921522140503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29305213689804077, + "epoch": 4.34, + "learning_rate": 2.8279797125950975e-05, + "loss": 0.5322, + "step": 5139, + "task_loss": 0.2341928333044052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41257110238075256, + "epoch": 4.34, + "learning_rate": 2.8275570583262895e-05, + "loss": 0.4207, + "step": 5140, + "task_loss": 0.22187206149101257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5428314805030823, + "epoch": 4.35, + "learning_rate": 2.827134404057481e-05, + "loss": 0.5645, + "step": 5141, + "task_loss": 0.457040935754776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38220298290252686, + "epoch": 4.35, + "learning_rate": 2.826711749788673e-05, + "loss": 0.4694, + "step": 5142, + "task_loss": 0.7440987825393677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2930101752281189, + "epoch": 4.35, + "learning_rate": 2.826289095519865e-05, + "loss": 0.3929, + "step": 5143, + "task_loss": 0.4625621438026428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3664424419403076, + "epoch": 4.35, + "learning_rate": 2.8258664412510567e-05, + "loss": 0.4099, + "step": 5144, + "task_loss": 0.12081511318683624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.378319650888443, + "epoch": 4.35, + "learning_rate": 2.8254437869822487e-05, + "loss": 0.4599, + "step": 5145, + "task_loss": 0.8400179147720337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.004839301109314, + "epoch": 4.35, + "learning_rate": 2.8250211327134407e-05, + "loss": 0.577, + "step": 5146, + "task_loss": 0.9192570447921753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2919743061065674, + "epoch": 4.35, + "learning_rate": 2.8245984784446323e-05, + "loss": 0.4254, + "step": 5147, + "task_loss": 0.27241548895835876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3417454659938812, + "epoch": 4.35, + "learning_rate": 2.8241758241758243e-05, + "loss": 0.589, + "step": 5148, + "task_loss": 0.3698180019855499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6555811762809753, + "epoch": 4.35, + "learning_rate": 2.8237531699070162e-05, + "loss": 0.6556, + "step": 5149, + "task_loss": 1.003419041633606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4510529041290283, + "epoch": 4.35, + "learning_rate": 2.823330515638208e-05, + "loss": 0.5821, + "step": 5150, + "task_loss": 0.924353301525116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5813985466957092, + "epoch": 4.35, + "learning_rate": 2.8229078613694e-05, + "loss": 0.4638, + "step": 5151, + "task_loss": 0.7580537796020508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7579513788223267, + "epoch": 4.35, + "learning_rate": 2.8224852071005918e-05, + "loss": 0.6081, + "step": 5152, + "task_loss": 1.168509840965271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6822163462638855, + "epoch": 4.36, + "learning_rate": 2.822062552831784e-05, + "loss": 0.5068, + "step": 5153, + "task_loss": 0.9986529350280762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42662423849105835, + "epoch": 4.36, + "learning_rate": 2.8216398985629754e-05, + "loss": 0.4762, + "step": 5154, + "task_loss": 0.916824221611023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.895922064781189, + "epoch": 4.36, + "learning_rate": 2.8212172442941674e-05, + "loss": 0.6484, + "step": 5155, + "task_loss": 0.5292621850967407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25648730993270874, + "epoch": 4.36, + "learning_rate": 2.8207945900253597e-05, + "loss": 0.6055, + "step": 5156, + "task_loss": 0.9580972194671631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.676588237285614, + "epoch": 4.36, + "learning_rate": 2.820371935756551e-05, + "loss": 0.6144, + "step": 5157, + "task_loss": 1.0475866794586182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28063124418258667, + "epoch": 4.36, + "learning_rate": 2.8199492814877433e-05, + "loss": 0.4578, + "step": 5158, + "task_loss": 0.45988017320632935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45851457118988037, + "epoch": 4.36, + "learning_rate": 2.8195266272189353e-05, + "loss": 0.4849, + "step": 5159, + "task_loss": 0.4729257822036743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49214595556259155, + "epoch": 4.36, + "learning_rate": 2.8191039729501266e-05, + "loss": 0.6618, + "step": 5160, + "task_loss": 0.5544630289077759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6935632228851318, + "epoch": 4.36, + "learning_rate": 2.818681318681319e-05, + "loss": 0.634, + "step": 5161, + "task_loss": 0.6638074517250061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30302244424819946, + "epoch": 4.36, + "learning_rate": 2.818258664412511e-05, + "loss": 0.4642, + "step": 5162, + "task_loss": 0.034671150147914886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2157883197069168, + "epoch": 4.36, + "learning_rate": 2.8178360101437022e-05, + "loss": 0.5834, + "step": 5163, + "task_loss": 0.12452114373445511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7114389538764954, + "epoch": 4.36, + "learning_rate": 2.8174133558748945e-05, + "loss": 0.7532, + "step": 5164, + "task_loss": 0.911513090133667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6030280590057373, + "epoch": 4.37, + "learning_rate": 2.8169907016060865e-05, + "loss": 0.6124, + "step": 5165, + "task_loss": 0.8780170679092407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5960670709609985, + "epoch": 4.37, + "learning_rate": 2.816568047337278e-05, + "loss": 0.5157, + "step": 5166, + "task_loss": 1.470974087715149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6917482018470764, + "epoch": 4.37, + "learning_rate": 2.81614539306847e-05, + "loss": 0.7225, + "step": 5167, + "task_loss": 1.1586589813232422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31945890188217163, + "epoch": 4.37, + "learning_rate": 2.815722738799662e-05, + "loss": 0.4557, + "step": 5168, + "task_loss": 0.4502628445625305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44876235723495483, + "epoch": 4.37, + "learning_rate": 2.815300084530854e-05, + "loss": 0.353, + "step": 5169, + "task_loss": 0.5005380511283875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6036719083786011, + "epoch": 4.37, + "learning_rate": 2.8148774302620457e-05, + "loss": 0.4186, + "step": 5170, + "task_loss": 0.5820677280426025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6830223798751831, + "epoch": 4.37, + "learning_rate": 2.8144547759932376e-05, + "loss": 0.7023, + "step": 5171, + "task_loss": 1.3801106214523315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.549087405204773, + "epoch": 4.37, + "learning_rate": 2.8140321217244296e-05, + "loss": 0.4464, + "step": 5172, + "task_loss": 0.7122361660003662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8788726329803467, + "epoch": 4.37, + "learning_rate": 2.8136094674556212e-05, + "loss": 0.5827, + "step": 5173, + "task_loss": 0.529780924320221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3800903856754303, + "epoch": 4.37, + "learning_rate": 2.8131868131868132e-05, + "loss": 0.4638, + "step": 5174, + "task_loss": 0.19307269155979156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7118173837661743, + "epoch": 4.37, + "learning_rate": 2.8127641589180055e-05, + "loss": 0.5074, + "step": 5175, + "task_loss": 1.2091130018234253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5326511859893799, + "epoch": 4.38, + "learning_rate": 2.8123415046491968e-05, + "loss": 0.4148, + "step": 5176, + "task_loss": 1.1868786811828613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8008134365081787, + "epoch": 4.38, + "learning_rate": 2.8119188503803888e-05, + "loss": 0.6732, + "step": 5177, + "task_loss": 0.4487271308898926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9325487613677979, + "epoch": 4.38, + "learning_rate": 2.811496196111581e-05, + "loss": 0.5885, + "step": 5178, + "task_loss": 0.8965787887573242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3975347876548767, + "epoch": 4.38, + "learning_rate": 2.8110735418427724e-05, + "loss": 0.4624, + "step": 5179, + "task_loss": 0.4935968816280365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3524544835090637, + "epoch": 4.38, + "learning_rate": 2.8106508875739644e-05, + "loss": 0.4805, + "step": 5180, + "task_loss": 0.6915262341499329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5573157668113708, + "epoch": 4.38, + "learning_rate": 2.8102282333051567e-05, + "loss": 0.5414, + "step": 5181, + "task_loss": 0.3662712275981903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4952223300933838, + "epoch": 4.38, + "learning_rate": 2.8098055790363487e-05, + "loss": 0.6287, + "step": 5182, + "task_loss": 1.260284185409546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7111854553222656, + "epoch": 4.38, + "learning_rate": 2.8093829247675403e-05, + "loss": 0.6655, + "step": 5183, + "task_loss": 1.1439898014068604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9234825372695923, + "epoch": 4.38, + "learning_rate": 2.8089602704987323e-05, + "loss": 0.7595, + "step": 5184, + "task_loss": 1.217672348022461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4007294178009033, + "epoch": 4.38, + "learning_rate": 2.8085376162299243e-05, + "loss": 0.4409, + "step": 5185, + "task_loss": 0.34053730964660645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7860308885574341, + "epoch": 4.38, + "learning_rate": 2.808114961961116e-05, + "loss": 0.4376, + "step": 5186, + "task_loss": 0.5460903644561768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3694063723087311, + "epoch": 4.38, + "learning_rate": 2.807692307692308e-05, + "loss": 0.4302, + "step": 5187, + "task_loss": 0.81557297706604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38470977544784546, + "epoch": 4.39, + "learning_rate": 2.8072696534235e-05, + "loss": 0.4431, + "step": 5188, + "task_loss": 0.4834229648113251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5069899559020996, + "epoch": 4.39, + "learning_rate": 2.8068469991546915e-05, + "loss": 0.4367, + "step": 5189, + "task_loss": 0.0913282111287117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6348316669464111, + "epoch": 4.39, + "learning_rate": 2.8064243448858834e-05, + "loss": 0.5962, + "step": 5190, + "task_loss": 0.70924311876297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3900063633918762, + "epoch": 4.39, + "learning_rate": 2.8060016906170754e-05, + "loss": 0.3692, + "step": 5191, + "task_loss": 0.291110634803772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5717165470123291, + "epoch": 4.39, + "learning_rate": 2.805579036348267e-05, + "loss": 0.5706, + "step": 5192, + "task_loss": 0.9611908197402954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5871396064758301, + "epoch": 4.39, + "learning_rate": 2.805156382079459e-05, + "loss": 0.4889, + "step": 5193, + "task_loss": 0.9330399036407471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4547578692436218, + "epoch": 4.39, + "learning_rate": 2.804733727810651e-05, + "loss": 0.5616, + "step": 5194, + "task_loss": 0.7625985145568848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2721131443977356, + "epoch": 4.39, + "learning_rate": 2.8043110735418426e-05, + "loss": 0.442, + "step": 5195, + "task_loss": 0.1250012218952179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.747524619102478, + "epoch": 4.39, + "learning_rate": 2.8038884192730346e-05, + "loss": 0.6382, + "step": 5196, + "task_loss": 0.3939744234085083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6001373529434204, + "epoch": 4.39, + "learning_rate": 2.8034657650042266e-05, + "loss": 0.4933, + "step": 5197, + "task_loss": 0.6505735516548157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7749235033988953, + "epoch": 4.39, + "learning_rate": 2.803043110735419e-05, + "loss": 0.7562, + "step": 5198, + "task_loss": 1.0037953853607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4544030725955963, + "epoch": 4.39, + "learning_rate": 2.8026204564666102e-05, + "loss": 0.5222, + "step": 5199, + "task_loss": 1.7628355026245117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4277747571468353, + "epoch": 4.4, + "learning_rate": 2.8021978021978025e-05, + "loss": 0.4876, + "step": 5200, + "task_loss": 0.40000858902931213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36930519342422485, + "epoch": 4.4, + "learning_rate": 2.8017751479289945e-05, + "loss": 0.6174, + "step": 5201, + "task_loss": 1.259460687637329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5621569156646729, + "epoch": 4.4, + "learning_rate": 2.8013524936601858e-05, + "loss": 0.5693, + "step": 5202, + "task_loss": 0.687770664691925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44103580713272095, + "epoch": 4.4, + "learning_rate": 2.800929839391378e-05, + "loss": 0.4953, + "step": 5203, + "task_loss": 1.1542632579803467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5126107931137085, + "epoch": 4.4, + "learning_rate": 2.80050718512257e-05, + "loss": 0.6372, + "step": 5204, + "task_loss": 0.7302573323249817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.457723468542099, + "epoch": 4.4, + "learning_rate": 2.8000845308537617e-05, + "loss": 0.4851, + "step": 5205, + "task_loss": 0.6507857441902161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44336998462677, + "epoch": 4.4, + "learning_rate": 2.7996618765849537e-05, + "loss": 0.5516, + "step": 5206, + "task_loss": 0.3427440822124481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5211141109466553, + "epoch": 4.4, + "learning_rate": 2.7992392223161456e-05, + "loss": 0.585, + "step": 5207, + "task_loss": 0.43093645572662354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.62564617395401, + "epoch": 4.4, + "learning_rate": 2.7988165680473373e-05, + "loss": 0.4695, + "step": 5208, + "task_loss": 1.0945377349853516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8042891025543213, + "epoch": 4.4, + "learning_rate": 2.7983939137785293e-05, + "loss": 0.6085, + "step": 5209, + "task_loss": 0.5824911594390869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5441581010818481, + "epoch": 4.4, + "learning_rate": 2.7979712595097212e-05, + "loss": 0.5682, + "step": 5210, + "task_loss": 0.9451205730438232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3167878985404968, + "epoch": 4.4, + "learning_rate": 2.7975486052409132e-05, + "loss": 0.4606, + "step": 5211, + "task_loss": 0.356450617313385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6397414207458496, + "epoch": 4.41, + "learning_rate": 2.797125950972105e-05, + "loss": 0.5088, + "step": 5212, + "task_loss": 1.4975876808166504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29197466373443604, + "epoch": 4.41, + "learning_rate": 2.7967032967032968e-05, + "loss": 0.4009, + "step": 5213, + "task_loss": 0.5427900552749634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46088218688964844, + "epoch": 4.41, + "learning_rate": 2.7962806424344888e-05, + "loss": 0.4312, + "step": 5214, + "task_loss": 0.38193684816360474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6076503396034241, + "epoch": 4.41, + "learning_rate": 2.7958579881656804e-05, + "loss": 0.5127, + "step": 5215, + "task_loss": 0.8249279260635376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.467257559299469, + "epoch": 4.41, + "learning_rate": 2.7954353338968724e-05, + "loss": 0.5268, + "step": 5216, + "task_loss": 0.9494364261627197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9039942026138306, + "epoch": 4.41, + "learning_rate": 2.7950126796280647e-05, + "loss": 0.6162, + "step": 5217, + "task_loss": 2.245042085647583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.616887092590332, + "epoch": 4.41, + "learning_rate": 2.794590025359256e-05, + "loss": 0.5327, + "step": 5218, + "task_loss": 0.709876298904419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6708094477653503, + "epoch": 4.41, + "learning_rate": 2.794167371090448e-05, + "loss": 0.5921, + "step": 5219, + "task_loss": 0.3885692059993744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4749789237976074, + "epoch": 4.41, + "learning_rate": 2.7937447168216403e-05, + "loss": 0.5761, + "step": 5220, + "task_loss": 0.6657482981681824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4687151312828064, + "epoch": 4.41, + "learning_rate": 2.7933220625528316e-05, + "loss": 0.4517, + "step": 5221, + "task_loss": 1.0021352767944336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4453340768814087, + "epoch": 4.41, + "learning_rate": 2.792899408284024e-05, + "loss": 0.4861, + "step": 5222, + "task_loss": 0.3758726119995117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32083624601364136, + "epoch": 4.41, + "learning_rate": 2.792476754015216e-05, + "loss": 0.6274, + "step": 5223, + "task_loss": 0.9310301542282104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46359506249427795, + "epoch": 4.42, + "learning_rate": 2.792054099746407e-05, + "loss": 0.5768, + "step": 5224, + "task_loss": 0.8322440981864929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6924983859062195, + "epoch": 4.42, + "learning_rate": 2.7916314454775995e-05, + "loss": 0.4488, + "step": 5225, + "task_loss": 0.1947295069694519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5955932140350342, + "epoch": 4.42, + "learning_rate": 2.7912087912087915e-05, + "loss": 0.5219, + "step": 5226, + "task_loss": 0.7560649514198303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4445628821849823, + "epoch": 4.42, + "learning_rate": 2.7907861369399834e-05, + "loss": 0.4714, + "step": 5227, + "task_loss": 0.8212294578552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44620150327682495, + "epoch": 4.42, + "learning_rate": 2.790363482671175e-05, + "loss": 0.4739, + "step": 5228, + "task_loss": 0.582520067691803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33446359634399414, + "epoch": 4.42, + "learning_rate": 2.789940828402367e-05, + "loss": 0.4245, + "step": 5229, + "task_loss": 0.40171048045158386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6825902462005615, + "epoch": 4.42, + "learning_rate": 2.789518174133559e-05, + "loss": 0.5452, + "step": 5230, + "task_loss": 0.5209140777587891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4893573820590973, + "epoch": 4.42, + "learning_rate": 2.7890955198647506e-05, + "loss": 0.6725, + "step": 5231, + "task_loss": 0.9621965289115906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25065481662750244, + "epoch": 4.42, + "learning_rate": 2.7886728655959426e-05, + "loss": 0.4579, + "step": 5232, + "task_loss": 0.46341046690940857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48631200194358826, + "epoch": 4.42, + "learning_rate": 2.7882502113271346e-05, + "loss": 0.5992, + "step": 5233, + "task_loss": 1.2627606391906738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5348921418190002, + "epoch": 4.42, + "learning_rate": 2.7878275570583262e-05, + "loss": 0.58, + "step": 5234, + "task_loss": 0.4680885672569275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5864924788475037, + "epoch": 4.42, + "learning_rate": 2.7874049027895182e-05, + "loss": 0.6615, + "step": 5235, + "task_loss": 1.0830177068710327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4064631164073944, + "epoch": 4.43, + "learning_rate": 2.7869822485207102e-05, + "loss": 0.7178, + "step": 5236, + "task_loss": 1.1240168809890747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39533764123916626, + "epoch": 4.43, + "learning_rate": 2.7865595942519018e-05, + "loss": 0.4301, + "step": 5237, + "task_loss": 0.17977426946163177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4698025584220886, + "epoch": 4.43, + "learning_rate": 2.7861369399830938e-05, + "loss": 0.546, + "step": 5238, + "task_loss": 0.7148550152778625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7171174883842468, + "epoch": 4.43, + "learning_rate": 2.785714285714286e-05, + "loss": 0.5485, + "step": 5239, + "task_loss": 0.7230456471443176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4743465185165405, + "epoch": 4.43, + "learning_rate": 2.785291631445478e-05, + "loss": 0.477, + "step": 5240, + "task_loss": 0.6340458989143372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6087259650230408, + "epoch": 4.43, + "learning_rate": 2.7848689771766694e-05, + "loss": 0.5505, + "step": 5241, + "task_loss": 0.8480997681617737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9905765652656555, + "epoch": 4.43, + "learning_rate": 2.7844463229078617e-05, + "loss": 0.6051, + "step": 5242, + "task_loss": 0.7984235286712646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35354849696159363, + "epoch": 4.43, + "learning_rate": 2.7840236686390537e-05, + "loss": 0.6114, + "step": 5243, + "task_loss": 0.1469717025756836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.547687292098999, + "epoch": 4.43, + "learning_rate": 2.783601014370245e-05, + "loss": 0.5184, + "step": 5244, + "task_loss": 1.2660895586013794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.717499852180481, + "epoch": 4.43, + "learning_rate": 2.7831783601014373e-05, + "loss": 0.6051, + "step": 5245, + "task_loss": 0.6356750726699829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4846644997596741, + "epoch": 4.43, + "learning_rate": 2.7827557058326292e-05, + "loss": 0.4907, + "step": 5246, + "task_loss": 0.9908105731010437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.59178227186203, + "epoch": 4.44, + "learning_rate": 2.782333051563821e-05, + "loss": 0.604, + "step": 5247, + "task_loss": 0.43177640438079834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6043274998664856, + "epoch": 4.44, + "learning_rate": 2.781910397295013e-05, + "loss": 0.6397, + "step": 5248, + "task_loss": 0.39989933371543884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.667040228843689, + "epoch": 4.44, + "learning_rate": 2.7814877430262048e-05, + "loss": 0.6386, + "step": 5249, + "task_loss": 1.2049132585525513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8686316609382629, + "epoch": 4.44, + "learning_rate": 2.7810650887573965e-05, + "loss": 0.7103, + "step": 5250, + "task_loss": 0.2922057807445526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8328902721405029, + "epoch": 4.44, + "learning_rate": 2.7806424344885884e-05, + "loss": 0.5476, + "step": 5251, + "task_loss": 0.49473509192466736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32229751348495483, + "epoch": 4.44, + "learning_rate": 2.7802197802197804e-05, + "loss": 0.4756, + "step": 5252, + "task_loss": 1.0162307024002075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5075170993804932, + "epoch": 4.44, + "learning_rate": 2.779797125950972e-05, + "loss": 0.5952, + "step": 5253, + "task_loss": 0.9417028427124023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48733970522880554, + "epoch": 4.44, + "learning_rate": 2.779374471682164e-05, + "loss": 0.5232, + "step": 5254, + "task_loss": 0.5937368869781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6276860237121582, + "epoch": 4.44, + "learning_rate": 2.778951817413356e-05, + "loss": 0.4717, + "step": 5255, + "task_loss": 1.359994649887085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2065477967262268, + "epoch": 4.44, + "learning_rate": 2.7785291631445483e-05, + "loss": 0.4319, + "step": 5256, + "task_loss": 0.3590351939201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3995498716831207, + "epoch": 4.44, + "learning_rate": 2.7781065088757396e-05, + "loss": 0.4323, + "step": 5257, + "task_loss": 0.4106886684894562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33497872948646545, + "epoch": 4.44, + "learning_rate": 2.7776838546069316e-05, + "loss": 0.3968, + "step": 5258, + "task_loss": 0.6700830459594727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45254287123680115, + "epoch": 4.45, + "learning_rate": 2.777261200338124e-05, + "loss": 0.5677, + "step": 5259, + "task_loss": 0.5990794897079468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8443390130996704, + "epoch": 4.45, + "learning_rate": 2.7768385460693152e-05, + "loss": 0.5287, + "step": 5260, + "task_loss": 0.4160047173500061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8077751994132996, + "epoch": 4.45, + "learning_rate": 2.776415891800507e-05, + "loss": 0.5666, + "step": 5261, + "task_loss": 0.8264917135238647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5909587144851685, + "epoch": 4.45, + "learning_rate": 2.7759932375316995e-05, + "loss": 0.6071, + "step": 5262, + "task_loss": 0.17001473903656006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21351748704910278, + "epoch": 4.45, + "learning_rate": 2.7755705832628908e-05, + "loss": 0.4412, + "step": 5263, + "task_loss": 0.4138297736644745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27882158756256104, + "epoch": 4.45, + "learning_rate": 2.775147928994083e-05, + "loss": 0.4527, + "step": 5264, + "task_loss": 0.19578656554222107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9294151067733765, + "epoch": 4.45, + "learning_rate": 2.774725274725275e-05, + "loss": 0.5838, + "step": 5265, + "task_loss": 0.5137839913368225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8721062541007996, + "epoch": 4.45, + "learning_rate": 2.7743026204564663e-05, + "loss": 0.745, + "step": 5266, + "task_loss": 1.0089004039764404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7218141555786133, + "epoch": 4.45, + "learning_rate": 2.7738799661876587e-05, + "loss": 0.5768, + "step": 5267, + "task_loss": 1.8011058568954468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5719237327575684, + "epoch": 4.45, + "learning_rate": 2.7734573119188506e-05, + "loss": 0.4491, + "step": 5268, + "task_loss": 0.2796790897846222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.443045973777771, + "epoch": 4.45, + "learning_rate": 2.7730346576500426e-05, + "loss": 0.4433, + "step": 5269, + "task_loss": 0.3776971101760864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5205014944076538, + "epoch": 4.45, + "learning_rate": 2.7726120033812342e-05, + "loss": 0.5577, + "step": 5270, + "task_loss": 1.1317591667175293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4009777307510376, + "epoch": 4.46, + "learning_rate": 2.7721893491124262e-05, + "loss": 0.4991, + "step": 5271, + "task_loss": 0.39043381810188293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4715713560581207, + "epoch": 4.46, + "learning_rate": 2.7717666948436182e-05, + "loss": 0.4602, + "step": 5272, + "task_loss": 0.5938316583633423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8763389587402344, + "epoch": 4.46, + "learning_rate": 2.7713440405748098e-05, + "loss": 0.5428, + "step": 5273, + "task_loss": 0.6430569887161255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48041558265686035, + "epoch": 4.46, + "learning_rate": 2.7709213863060018e-05, + "loss": 0.5199, + "step": 5274, + "task_loss": 0.4133508801460266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3968963623046875, + "epoch": 4.46, + "learning_rate": 2.7704987320371938e-05, + "loss": 0.4196, + "step": 5275, + "task_loss": 0.39113008975982666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5474100112915039, + "epoch": 4.46, + "learning_rate": 2.7700760777683854e-05, + "loss": 0.7283, + "step": 5276, + "task_loss": 1.3075909614562988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6922458410263062, + "epoch": 4.46, + "learning_rate": 2.7696534234995774e-05, + "loss": 0.4851, + "step": 5277, + "task_loss": 0.9054099917411804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29515084624290466, + "epoch": 4.46, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.4854, + "step": 5278, + "task_loss": 0.2939417362213135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5836045145988464, + "epoch": 4.46, + "learning_rate": 2.768808114961961e-05, + "loss": 0.5121, + "step": 5279, + "task_loss": 0.8165667057037354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43753018975257874, + "epoch": 4.46, + "learning_rate": 2.768385460693153e-05, + "loss": 0.5125, + "step": 5280, + "task_loss": 1.2097663879394531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42646661400794983, + "epoch": 4.46, + "learning_rate": 2.7679628064243453e-05, + "loss": 0.5244, + "step": 5281, + "task_loss": 0.4319024085998535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5711855292320251, + "epoch": 4.46, + "learning_rate": 2.7675401521555366e-05, + "loss": 0.4711, + "step": 5282, + "task_loss": 1.2479275465011597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4802483916282654, + "epoch": 4.47, + "learning_rate": 2.7671174978867285e-05, + "loss": 0.5311, + "step": 5283, + "task_loss": 0.720216691493988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4605392813682556, + "epoch": 4.47, + "learning_rate": 2.766694843617921e-05, + "loss": 0.5406, + "step": 5284, + "task_loss": 0.6428165435791016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.451355904340744, + "epoch": 4.47, + "learning_rate": 2.766272189349113e-05, + "loss": 0.5133, + "step": 5285, + "task_loss": 0.3209630846977234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5163174867630005, + "epoch": 4.47, + "learning_rate": 2.7658495350803045e-05, + "loss": 0.5538, + "step": 5286, + "task_loss": 0.5297533273696899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40374863147735596, + "epoch": 4.47, + "learning_rate": 2.7654268808114964e-05, + "loss": 0.5162, + "step": 5287, + "task_loss": 0.6043429374694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33275893330574036, + "epoch": 4.47, + "learning_rate": 2.7650042265426884e-05, + "loss": 0.5044, + "step": 5288, + "task_loss": 0.7751222848892212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46711021661758423, + "epoch": 4.47, + "learning_rate": 2.76458157227388e-05, + "loss": 0.5104, + "step": 5289, + "task_loss": 0.5457462668418884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8899959921836853, + "epoch": 4.47, + "learning_rate": 2.764158918005072e-05, + "loss": 0.6966, + "step": 5290, + "task_loss": 0.5391844511032104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6285883784294128, + "epoch": 4.47, + "learning_rate": 2.763736263736264e-05, + "loss": 0.5784, + "step": 5291, + "task_loss": 0.7773985862731934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5362526774406433, + "epoch": 4.47, + "learning_rate": 2.7633136094674556e-05, + "loss": 0.5492, + "step": 5292, + "task_loss": 0.3517150580883026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30984261631965637, + "epoch": 4.47, + "learning_rate": 2.7628909551986476e-05, + "loss": 0.4233, + "step": 5293, + "task_loss": 0.30853402614593506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33329862356185913, + "epoch": 4.47, + "learning_rate": 2.7624683009298396e-05, + "loss": 0.4847, + "step": 5294, + "task_loss": 0.32293060421943665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25270822644233704, + "epoch": 4.48, + "learning_rate": 2.7620456466610312e-05, + "loss": 0.4156, + "step": 5295, + "task_loss": 0.6066348552703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34133803844451904, + "epoch": 4.48, + "learning_rate": 2.7616229923922232e-05, + "loss": 0.4313, + "step": 5296, + "task_loss": 1.1871721744537354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5936399102210999, + "epoch": 4.48, + "learning_rate": 2.761200338123415e-05, + "loss": 0.4867, + "step": 5297, + "task_loss": 0.6456088423728943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7116214632987976, + "epoch": 4.48, + "learning_rate": 2.7607776838546075e-05, + "loss": 0.8233, + "step": 5298, + "task_loss": 1.1166901588439941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8979636430740356, + "epoch": 4.48, + "learning_rate": 2.7603550295857988e-05, + "loss": 0.6894, + "step": 5299, + "task_loss": 0.752702534198761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2484954297542572, + "epoch": 4.48, + "learning_rate": 2.7599323753169907e-05, + "loss": 0.3845, + "step": 5300, + "task_loss": 0.35879749059677124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5948817729949951, + "epoch": 4.48, + "learning_rate": 2.759509721048183e-05, + "loss": 0.6222, + "step": 5301, + "task_loss": 2.029428482055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7001022100448608, + "epoch": 4.48, + "learning_rate": 2.7590870667793744e-05, + "loss": 0.602, + "step": 5302, + "task_loss": 0.42063450813293457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47895729541778564, + "epoch": 4.48, + "learning_rate": 2.7586644125105667e-05, + "loss": 0.4279, + "step": 5303, + "task_loss": 1.356555700302124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9620168209075928, + "epoch": 4.48, + "learning_rate": 2.7582417582417586e-05, + "loss": 0.9009, + "step": 5304, + "task_loss": 1.9546177387237549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42540958523750305, + "epoch": 4.48, + "learning_rate": 2.75781910397295e-05, + "loss": 0.6444, + "step": 5305, + "task_loss": 0.9007807970046997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4218650162220001, + "epoch": 4.48, + "learning_rate": 2.7573964497041422e-05, + "loss": 0.4138, + "step": 5306, + "task_loss": 0.2950070798397064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5379204750061035, + "epoch": 4.49, + "learning_rate": 2.7569737954353342e-05, + "loss": 0.5827, + "step": 5307, + "task_loss": 0.5207168459892273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6428512930870056, + "epoch": 4.49, + "learning_rate": 2.7565511411665255e-05, + "loss": 0.7337, + "step": 5308, + "task_loss": 0.8318116664886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7405051589012146, + "epoch": 4.49, + "learning_rate": 2.756128486897718e-05, + "loss": 0.4726, + "step": 5309, + "task_loss": 1.6033854484558105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3466702103614807, + "epoch": 4.49, + "learning_rate": 2.7557058326289098e-05, + "loss": 0.5209, + "step": 5310, + "task_loss": 1.1743545532226562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37898385524749756, + "epoch": 4.49, + "learning_rate": 2.7552831783601014e-05, + "loss": 0.5949, + "step": 5311, + "task_loss": 0.7249733805656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36499926447868347, + "epoch": 4.49, + "learning_rate": 2.7548605240912934e-05, + "loss": 0.497, + "step": 5312, + "task_loss": 0.38812413811683655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.256285697221756, + "epoch": 4.49, + "learning_rate": 2.7544378698224854e-05, + "loss": 0.4844, + "step": 5313, + "task_loss": 0.4137265384197235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7692514657974243, + "epoch": 4.49, + "learning_rate": 2.7540152155536774e-05, + "loss": 0.5162, + "step": 5314, + "task_loss": 1.0254405736923218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3826620876789093, + "epoch": 4.49, + "learning_rate": 2.753592561284869e-05, + "loss": 0.6084, + "step": 5315, + "task_loss": 0.1097988486289978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48812592029571533, + "epoch": 4.49, + "learning_rate": 2.753169907016061e-05, + "loss": 0.491, + "step": 5316, + "task_loss": 0.5517425537109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32362592220306396, + "epoch": 4.49, + "learning_rate": 2.752747252747253e-05, + "loss": 0.5018, + "step": 5317, + "task_loss": 0.23633944988250732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5260745286941528, + "epoch": 4.5, + "learning_rate": 2.7523245984784446e-05, + "loss": 0.5458, + "step": 5318, + "task_loss": 0.2933235764503479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6433345079421997, + "epoch": 4.5, + "learning_rate": 2.7519019442096366e-05, + "loss": 0.5014, + "step": 5319, + "task_loss": 0.7560244202613831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.53708815574646, + "epoch": 4.5, + "learning_rate": 2.751479289940829e-05, + "loss": 0.5582, + "step": 5320, + "task_loss": 0.914886474609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49981334805488586, + "epoch": 4.5, + "learning_rate": 2.75105663567202e-05, + "loss": 0.5058, + "step": 5321, + "task_loss": 0.5407476425170898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4667494297027588, + "epoch": 4.5, + "learning_rate": 2.750633981403212e-05, + "loss": 0.4379, + "step": 5322, + "task_loss": 0.37113475799560547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41166138648986816, + "epoch": 4.5, + "learning_rate": 2.7502113271344044e-05, + "loss": 0.4221, + "step": 5323, + "task_loss": 0.23486945033073425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7390798330307007, + "epoch": 4.5, + "learning_rate": 2.7497886728655957e-05, + "loss": 0.5183, + "step": 5324, + "task_loss": 0.24238334596157074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6582691073417664, + "epoch": 4.5, + "learning_rate": 2.7493660185967877e-05, + "loss": 0.5836, + "step": 5325, + "task_loss": 0.4756070375442505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3439198136329651, + "epoch": 4.5, + "learning_rate": 2.74894336432798e-05, + "loss": 0.663, + "step": 5326, + "task_loss": 0.4897440969944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7269107699394226, + "epoch": 4.5, + "learning_rate": 2.748520710059172e-05, + "loss": 0.4789, + "step": 5327, + "task_loss": 0.5211256146430969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5550800561904907, + "epoch": 4.5, + "learning_rate": 2.7480980557903636e-05, + "loss": 0.4562, + "step": 5328, + "task_loss": 0.6577306389808655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43927112221717834, + "epoch": 4.5, + "learning_rate": 2.7476754015215556e-05, + "loss": 0.4786, + "step": 5329, + "task_loss": 0.743955671787262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20793700218200684, + "epoch": 4.51, + "learning_rate": 2.7472527472527476e-05, + "loss": 0.371, + "step": 5330, + "task_loss": 0.0683882087469101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5816153287887573, + "epoch": 4.51, + "learning_rate": 2.7468300929839392e-05, + "loss": 0.5147, + "step": 5331, + "task_loss": 0.9620684385299683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36975929141044617, + "epoch": 4.51, + "learning_rate": 2.7464074387151312e-05, + "loss": 0.4648, + "step": 5332, + "task_loss": 0.9288278818130493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40742093324661255, + "epoch": 4.51, + "learning_rate": 2.7459847844463232e-05, + "loss": 0.5185, + "step": 5333, + "task_loss": 0.4179134964942932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5236296653747559, + "epoch": 4.51, + "learning_rate": 2.7455621301775148e-05, + "loss": 0.741, + "step": 5334, + "task_loss": 1.2751891613006592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6133717894554138, + "epoch": 4.51, + "learning_rate": 2.7451394759087068e-05, + "loss": 0.5611, + "step": 5335, + "task_loss": 1.061793565750122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5870427489280701, + "epoch": 4.51, + "learning_rate": 2.7447168216398988e-05, + "loss": 0.4618, + "step": 5336, + "task_loss": 0.757529616355896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35291415452957153, + "epoch": 4.51, + "learning_rate": 2.7442941673710904e-05, + "loss": 0.4923, + "step": 5337, + "task_loss": 0.08009390532970428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6521784067153931, + "epoch": 4.51, + "learning_rate": 2.7438715131022824e-05, + "loss": 0.736, + "step": 5338, + "task_loss": 1.6338598728179932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46560460329055786, + "epoch": 4.51, + "learning_rate": 2.7434488588334743e-05, + "loss": 0.3905, + "step": 5339, + "task_loss": 0.8386006951332092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5951083302497864, + "epoch": 4.51, + "learning_rate": 2.743026204564666e-05, + "loss": 0.5619, + "step": 5340, + "task_loss": 1.3118579387664795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4544222354888916, + "epoch": 4.51, + "learning_rate": 2.742603550295858e-05, + "loss": 0.5228, + "step": 5341, + "task_loss": 0.3645521402359009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4155057668685913, + "epoch": 4.52, + "learning_rate": 2.74218089602705e-05, + "loss": 0.5417, + "step": 5342, + "task_loss": 0.1335374116897583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37532559037208557, + "epoch": 4.52, + "learning_rate": 2.7417582417582422e-05, + "loss": 0.6281, + "step": 5343, + "task_loss": 0.048613499850034714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.755224347114563, + "epoch": 4.52, + "learning_rate": 2.7413355874894335e-05, + "loss": 0.6334, + "step": 5344, + "task_loss": 0.8768181204795837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39834776520729065, + "epoch": 4.52, + "learning_rate": 2.740912933220626e-05, + "loss": 0.4996, + "step": 5345, + "task_loss": 0.4868217706680298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26929712295532227, + "epoch": 4.52, + "learning_rate": 2.7404902789518178e-05, + "loss": 0.6149, + "step": 5346, + "task_loss": 0.2461363524198532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6296822428703308, + "epoch": 4.52, + "learning_rate": 2.740067624683009e-05, + "loss": 0.6245, + "step": 5347, + "task_loss": 1.5669865608215332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3427096903324127, + "epoch": 4.52, + "learning_rate": 2.7396449704142014e-05, + "loss": 0.4173, + "step": 5348, + "task_loss": 0.5462246537208557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3833007514476776, + "epoch": 4.52, + "learning_rate": 2.7392223161453934e-05, + "loss": 0.5529, + "step": 5349, + "task_loss": 0.740264356136322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48672664165496826, + "epoch": 4.52, + "learning_rate": 2.738799661876585e-05, + "loss": 0.5163, + "step": 5350, + "task_loss": 1.0009360313415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4030136466026306, + "epoch": 4.52, + "learning_rate": 2.738377007607777e-05, + "loss": 0.5861, + "step": 5351, + "task_loss": 0.3999110460281372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2966085374355316, + "epoch": 4.52, + "learning_rate": 2.737954353338969e-05, + "loss": 0.5322, + "step": 5352, + "task_loss": 0.07671620696783066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7703042030334473, + "epoch": 4.52, + "learning_rate": 2.7375316990701606e-05, + "loss": 0.5746, + "step": 5353, + "task_loss": 0.9699040651321411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5411189794540405, + "epoch": 4.53, + "learning_rate": 2.7371090448013526e-05, + "loss": 0.4717, + "step": 5354, + "task_loss": 1.0553245544433594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4826720356941223, + "epoch": 4.53, + "learning_rate": 2.7366863905325446e-05, + "loss": 0.5296, + "step": 5355, + "task_loss": 0.699677050113678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9032210111618042, + "epoch": 4.53, + "learning_rate": 2.7362637362637365e-05, + "loss": 0.5969, + "step": 5356, + "task_loss": 1.0561261177062988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43842482566833496, + "epoch": 4.53, + "learning_rate": 2.7358410819949282e-05, + "loss": 0.4893, + "step": 5357, + "task_loss": 0.14813998341560364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7673283219337463, + "epoch": 4.53, + "learning_rate": 2.73541842772612e-05, + "loss": 0.606, + "step": 5358, + "task_loss": 0.9109212160110474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7322139739990234, + "epoch": 4.53, + "learning_rate": 2.734995773457312e-05, + "loss": 0.5907, + "step": 5359, + "task_loss": 1.4554837942123413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.861885666847229, + "epoch": 4.53, + "learning_rate": 2.7345731191885038e-05, + "loss": 0.6039, + "step": 5360, + "task_loss": 1.3273420333862305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40028080344200134, + "epoch": 4.53, + "learning_rate": 2.7341504649196957e-05, + "loss": 0.5511, + "step": 5361, + "task_loss": 0.21868599951267242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7488853931427002, + "epoch": 4.53, + "learning_rate": 2.733727810650888e-05, + "loss": 0.7021, + "step": 5362, + "task_loss": 0.641872227191925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6559021472930908, + "epoch": 4.53, + "learning_rate": 2.7333051563820793e-05, + "loss": 0.6907, + "step": 5363, + "task_loss": 1.3550121784210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48939910531044006, + "epoch": 4.53, + "learning_rate": 2.7328825021132713e-05, + "loss": 0.5435, + "step": 5364, + "task_loss": 1.0295660495758057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5626007914543152, + "epoch": 4.53, + "learning_rate": 2.7324598478444636e-05, + "loss": 0.4786, + "step": 5365, + "task_loss": 0.550451934337616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5613609552383423, + "epoch": 4.54, + "learning_rate": 2.732037193575655e-05, + "loss": 0.4789, + "step": 5366, + "task_loss": 0.808321475982666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4478108286857605, + "epoch": 4.54, + "learning_rate": 2.7316145393068472e-05, + "loss": 0.4906, + "step": 5367, + "task_loss": 0.33435213565826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5399828553199768, + "epoch": 4.54, + "learning_rate": 2.7311918850380392e-05, + "loss": 0.4696, + "step": 5368, + "task_loss": 0.4496751129627228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41021180152893066, + "epoch": 4.54, + "learning_rate": 2.7307692307692305e-05, + "loss": 0.6109, + "step": 5369, + "task_loss": 0.4057617485523224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3225499391555786, + "epoch": 4.54, + "learning_rate": 2.7303465765004228e-05, + "loss": 0.4364, + "step": 5370, + "task_loss": 0.618878185749054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4486207365989685, + "epoch": 4.54, + "learning_rate": 2.7299239222316148e-05, + "loss": 0.4617, + "step": 5371, + "task_loss": 0.4366832971572876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2765759825706482, + "epoch": 4.54, + "learning_rate": 2.7295012679628068e-05, + "loss": 0.3803, + "step": 5372, + "task_loss": 0.42383480072021484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6151210069656372, + "epoch": 4.54, + "learning_rate": 2.7290786136939984e-05, + "loss": 0.4874, + "step": 5373, + "task_loss": 1.1104873418807983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4537014663219452, + "epoch": 4.54, + "learning_rate": 2.7286559594251904e-05, + "loss": 0.4734, + "step": 5374, + "task_loss": 0.4669799506664276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6336425542831421, + "epoch": 4.54, + "learning_rate": 2.7282333051563823e-05, + "loss": 0.6792, + "step": 5375, + "task_loss": 0.23321525752544403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41254305839538574, + "epoch": 4.54, + "learning_rate": 2.727810650887574e-05, + "loss": 0.5366, + "step": 5376, + "task_loss": 0.2227546125650406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5170974135398865, + "epoch": 4.54, + "learning_rate": 2.727387996618766e-05, + "loss": 0.4602, + "step": 5377, + "task_loss": 0.4968531131744385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7210485935211182, + "epoch": 4.55, + "learning_rate": 2.726965342349958e-05, + "loss": 0.444, + "step": 5378, + "task_loss": 0.5338968634605408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6378473043441772, + "epoch": 4.55, + "learning_rate": 2.7265426880811496e-05, + "loss": 0.6686, + "step": 5379, + "task_loss": 0.381631463766098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4934467077255249, + "epoch": 4.55, + "learning_rate": 2.7261200338123415e-05, + "loss": 0.4727, + "step": 5380, + "task_loss": 0.5844487547874451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7077147960662842, + "epoch": 4.55, + "learning_rate": 2.7256973795435335e-05, + "loss": 0.6066, + "step": 5381, + "task_loss": 0.6631092429161072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4296506643295288, + "epoch": 4.55, + "learning_rate": 2.725274725274725e-05, + "loss": 0.4946, + "step": 5382, + "task_loss": 0.6394067406654358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.596145510673523, + "epoch": 4.55, + "learning_rate": 2.724852071005917e-05, + "loss": 0.5587, + "step": 5383, + "task_loss": 0.9702220559120178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5406750440597534, + "epoch": 4.55, + "learning_rate": 2.7244294167371094e-05, + "loss": 0.5037, + "step": 5384, + "task_loss": 0.48072123527526855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49670472741127014, + "epoch": 4.55, + "learning_rate": 2.7240067624683014e-05, + "loss": 0.5095, + "step": 5385, + "task_loss": 1.3811817169189453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3292773962020874, + "epoch": 4.55, + "learning_rate": 2.7235841081994927e-05, + "loss": 0.5871, + "step": 5386, + "task_loss": 1.177852988243103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33358633518218994, + "epoch": 4.55, + "learning_rate": 2.723161453930685e-05, + "loss": 0.7863, + "step": 5387, + "task_loss": 1.5302704572677612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3668261170387268, + "epoch": 4.55, + "learning_rate": 2.722738799661877e-05, + "loss": 0.6719, + "step": 5388, + "task_loss": 0.8525644540786743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7261632680892944, + "epoch": 4.56, + "learning_rate": 2.7223161453930683e-05, + "loss": 0.5054, + "step": 5389, + "task_loss": 1.2605348825454712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9297285079956055, + "epoch": 4.56, + "learning_rate": 2.7218934911242606e-05, + "loss": 0.5816, + "step": 5390, + "task_loss": 2.27435040473938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7317273616790771, + "epoch": 4.56, + "learning_rate": 2.7214708368554526e-05, + "loss": 0.5447, + "step": 5391, + "task_loss": 0.6920271515846252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4560929238796234, + "epoch": 4.56, + "learning_rate": 2.7210481825866442e-05, + "loss": 0.4907, + "step": 5392, + "task_loss": 0.4183073043823242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2572377026081085, + "epoch": 4.56, + "learning_rate": 2.7206255283178362e-05, + "loss": 0.5336, + "step": 5393, + "task_loss": 1.0202858448028564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6760679483413696, + "epoch": 4.56, + "learning_rate": 2.720202874049028e-05, + "loss": 0.6966, + "step": 5394, + "task_loss": 0.5301749110221863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7935565710067749, + "epoch": 4.56, + "learning_rate": 2.7197802197802198e-05, + "loss": 0.508, + "step": 5395, + "task_loss": 0.9583966135978699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4155125916004181, + "epoch": 4.56, + "learning_rate": 2.7193575655114118e-05, + "loss": 0.527, + "step": 5396, + "task_loss": 0.5703842639923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.596451461315155, + "epoch": 4.56, + "learning_rate": 2.7189349112426037e-05, + "loss": 0.629, + "step": 5397, + "task_loss": 1.1101837158203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6796680688858032, + "epoch": 4.56, + "learning_rate": 2.7185122569737954e-05, + "loss": 0.6547, + "step": 5398, + "task_loss": 1.5831326246261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2625083923339844, + "epoch": 4.56, + "learning_rate": 2.7180896027049873e-05, + "loss": 0.4714, + "step": 5399, + "task_loss": 0.16241151094436646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22443480789661407, + "epoch": 4.56, + "learning_rate": 2.7176669484361793e-05, + "loss": 0.4459, + "step": 5400, + "task_loss": 0.4830724895000458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6096628904342651, + "epoch": 4.57, + "learning_rate": 2.7172442941673716e-05, + "loss": 0.524, + "step": 5401, + "task_loss": 0.3656918406486511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5304520726203918, + "epoch": 4.57, + "learning_rate": 2.716821639898563e-05, + "loss": 0.5741, + "step": 5402, + "task_loss": 0.6311800479888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7263672351837158, + "epoch": 4.57, + "learning_rate": 2.716398985629755e-05, + "loss": 0.5292, + "step": 5403, + "task_loss": 0.5756078958511353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.412069171667099, + "epoch": 4.57, + "learning_rate": 2.7159763313609472e-05, + "loss": 0.5627, + "step": 5404, + "task_loss": 0.8759124875068665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4952296018600464, + "epoch": 4.57, + "learning_rate": 2.7155536770921385e-05, + "loss": 0.4858, + "step": 5405, + "task_loss": 0.047556180506944656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3848026394844055, + "epoch": 4.57, + "learning_rate": 2.7151310228233305e-05, + "loss": 0.4534, + "step": 5406, + "task_loss": 1.5189660787582397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42357152700424194, + "epoch": 4.57, + "learning_rate": 2.7147083685545228e-05, + "loss": 0.4804, + "step": 5407, + "task_loss": 1.2142369747161865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7135859727859497, + "epoch": 4.57, + "learning_rate": 2.714285714285714e-05, + "loss": 0.633, + "step": 5408, + "task_loss": 0.8967861533164978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3272915184497833, + "epoch": 4.57, + "learning_rate": 2.7138630600169064e-05, + "loss": 0.6056, + "step": 5409, + "task_loss": 0.2025403529405594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1530689299106598, + "epoch": 4.57, + "learning_rate": 2.7134404057480984e-05, + "loss": 0.4313, + "step": 5410, + "task_loss": 0.26233577728271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2940325438976288, + "epoch": 4.57, + "learning_rate": 2.7130177514792897e-05, + "loss": 0.549, + "step": 5411, + "task_loss": 0.31544169783592224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4194158911705017, + "epoch": 4.57, + "learning_rate": 2.712595097210482e-05, + "loss": 0.5072, + "step": 5412, + "task_loss": 1.0105589628219604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22945697605609894, + "epoch": 4.58, + "learning_rate": 2.712172442941674e-05, + "loss": 0.4961, + "step": 5413, + "task_loss": 0.27859506011009216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48602455854415894, + "epoch": 4.58, + "learning_rate": 2.711749788672866e-05, + "loss": 0.5022, + "step": 5414, + "task_loss": 0.3840574026107788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5287999510765076, + "epoch": 4.58, + "learning_rate": 2.7113271344040576e-05, + "loss": 0.4271, + "step": 5415, + "task_loss": 0.3990764617919922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2725510597229004, + "epoch": 4.58, + "learning_rate": 2.7109044801352495e-05, + "loss": 0.4651, + "step": 5416, + "task_loss": 0.3067554235458374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6128413081169128, + "epoch": 4.58, + "learning_rate": 2.7104818258664415e-05, + "loss": 0.6796, + "step": 5417, + "task_loss": 0.5863294005393982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4496791660785675, + "epoch": 4.58, + "learning_rate": 2.710059171597633e-05, + "loss": 0.598, + "step": 5418, + "task_loss": 0.6556788086891174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7718957662582397, + "epoch": 4.58, + "learning_rate": 2.709636517328825e-05, + "loss": 0.5533, + "step": 5419, + "task_loss": 1.0133272409439087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5502604842185974, + "epoch": 4.58, + "learning_rate": 2.709213863060017e-05, + "loss": 0.474, + "step": 5420, + "task_loss": 0.5062881112098694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6409262418746948, + "epoch": 4.58, + "learning_rate": 2.7087912087912087e-05, + "loss": 0.5903, + "step": 5421, + "task_loss": 1.1804314851760864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4186840057373047, + "epoch": 4.58, + "learning_rate": 2.7083685545224007e-05, + "loss": 0.5779, + "step": 5422, + "task_loss": 0.17031022906303406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20987167954444885, + "epoch": 4.58, + "learning_rate": 2.7079459002535927e-05, + "loss": 0.4459, + "step": 5423, + "task_loss": 0.5038543939590454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8138952255249023, + "epoch": 4.58, + "learning_rate": 2.7075232459847843e-05, + "loss": 0.6911, + "step": 5424, + "task_loss": 0.4157523810863495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4471595883369446, + "epoch": 4.59, + "learning_rate": 2.7071005917159763e-05, + "loss": 0.5376, + "step": 5425, + "task_loss": 0.5205883979797363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2984365224838257, + "epoch": 4.59, + "learning_rate": 2.7066779374471686e-05, + "loss": 0.3926, + "step": 5426, + "task_loss": 0.26941877603530884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5124970078468323, + "epoch": 4.59, + "learning_rate": 2.70625528317836e-05, + "loss": 0.4878, + "step": 5427, + "task_loss": 1.5197886228561401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7513706684112549, + "epoch": 4.59, + "learning_rate": 2.705832628909552e-05, + "loss": 0.5375, + "step": 5428, + "task_loss": 0.1762867122888565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47924187779426575, + "epoch": 4.59, + "learning_rate": 2.7054099746407442e-05, + "loss": 0.5484, + "step": 5429, + "task_loss": 0.4806961715221405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5714477300643921, + "epoch": 4.59, + "learning_rate": 2.704987320371936e-05, + "loss": 0.421, + "step": 5430, + "task_loss": 0.9996347427368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46663421392440796, + "epoch": 4.59, + "learning_rate": 2.7045646661031278e-05, + "loss": 0.4469, + "step": 5431, + "task_loss": 0.48486167192459106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7124150395393372, + "epoch": 4.59, + "learning_rate": 2.7041420118343198e-05, + "loss": 0.4811, + "step": 5432, + "task_loss": 0.6984152793884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3023144006729126, + "epoch": 4.59, + "learning_rate": 2.7037193575655117e-05, + "loss": 0.6099, + "step": 5433, + "task_loss": 0.5038154125213623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4609799385070801, + "epoch": 4.59, + "learning_rate": 2.7032967032967034e-05, + "loss": 0.6469, + "step": 5434, + "task_loss": 0.7658967971801758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9310113787651062, + "epoch": 4.59, + "learning_rate": 2.7028740490278954e-05, + "loss": 0.5694, + "step": 5435, + "task_loss": 1.0580936670303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4861798882484436, + "epoch": 4.59, + "learning_rate": 2.7024513947590873e-05, + "loss": 0.55, + "step": 5436, + "task_loss": 1.2361242771148682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3817145824432373, + "epoch": 4.6, + "learning_rate": 2.702028740490279e-05, + "loss": 0.3634, + "step": 5437, + "task_loss": 0.9464032053947449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6354399919509888, + "epoch": 4.6, + "learning_rate": 2.701606086221471e-05, + "loss": 0.4569, + "step": 5438, + "task_loss": 0.7765830159187317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31423741579055786, + "epoch": 4.6, + "learning_rate": 2.701183431952663e-05, + "loss": 0.462, + "step": 5439, + "task_loss": 0.7643583416938782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7644920349121094, + "epoch": 4.6, + "learning_rate": 2.7007607776838545e-05, + "loss": 0.7352, + "step": 5440, + "task_loss": 0.6541059613227844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.624780535697937, + "epoch": 4.6, + "learning_rate": 2.7003381234150465e-05, + "loss": 0.5336, + "step": 5441, + "task_loss": 0.9501602053642273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5203424096107483, + "epoch": 4.6, + "learning_rate": 2.6999154691462385e-05, + "loss": 0.4896, + "step": 5442, + "task_loss": 0.8856216073036194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3527968227863312, + "epoch": 4.6, + "learning_rate": 2.6994928148774308e-05, + "loss": 0.4545, + "step": 5443, + "task_loss": 0.1643494814634323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.355309247970581, + "epoch": 4.6, + "learning_rate": 2.699070160608622e-05, + "loss": 0.8362, + "step": 5444, + "task_loss": 0.8305923938751221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3862878680229187, + "epoch": 4.6, + "learning_rate": 2.698647506339814e-05, + "loss": 0.4791, + "step": 5445, + "task_loss": 0.22194339334964752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9239904880523682, + "epoch": 4.6, + "learning_rate": 2.6982248520710064e-05, + "loss": 0.655, + "step": 5446, + "task_loss": 0.6507148146629333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4985984265804291, + "epoch": 4.6, + "learning_rate": 2.6978021978021977e-05, + "loss": 0.6384, + "step": 5447, + "task_loss": 0.453459769487381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4208281338214874, + "epoch": 4.6, + "learning_rate": 2.69737954353339e-05, + "loss": 0.5135, + "step": 5448, + "task_loss": 0.4005371332168579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7169245481491089, + "epoch": 4.61, + "learning_rate": 2.696956889264582e-05, + "loss": 0.6347, + "step": 5449, + "task_loss": 0.8415381908416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31406426429748535, + "epoch": 4.61, + "learning_rate": 2.6965342349957733e-05, + "loss": 0.4154, + "step": 5450, + "task_loss": 0.44339028000831604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35602590441703796, + "epoch": 4.61, + "learning_rate": 2.6961115807269656e-05, + "loss": 0.4337, + "step": 5451, + "task_loss": 0.7294882535934448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5621517896652222, + "epoch": 4.61, + "learning_rate": 2.6956889264581576e-05, + "loss": 0.4816, + "step": 5452, + "task_loss": 0.8586968183517456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7009731531143188, + "epoch": 4.61, + "learning_rate": 2.695266272189349e-05, + "loss": 0.6647, + "step": 5453, + "task_loss": 1.2892886400222778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6520194411277771, + "epoch": 4.61, + "learning_rate": 2.694843617920541e-05, + "loss": 0.607, + "step": 5454, + "task_loss": 1.3366714715957642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4571188986301422, + "epoch": 4.61, + "learning_rate": 2.694420963651733e-05, + "loss": 0.6976, + "step": 5455, + "task_loss": 1.25129234790802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22270163893699646, + "epoch": 4.61, + "learning_rate": 2.6939983093829248e-05, + "loss": 0.5555, + "step": 5456, + "task_loss": 0.17719006538391113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5421428084373474, + "epoch": 4.61, + "learning_rate": 2.6935756551141167e-05, + "loss": 0.4608, + "step": 5457, + "task_loss": 1.2978284358978271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29250138998031616, + "epoch": 4.61, + "learning_rate": 2.6931530008453087e-05, + "loss": 0.4156, + "step": 5458, + "task_loss": 0.860486626625061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5911927223205566, + "epoch": 4.61, + "learning_rate": 2.6927303465765007e-05, + "loss": 0.6204, + "step": 5459, + "task_loss": 1.335366129875183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39149215817451477, + "epoch": 4.61, + "learning_rate": 2.6923076923076923e-05, + "loss": 0.5756, + "step": 5460, + "task_loss": 0.37639501690864563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2616904377937317, + "epoch": 4.62, + "learning_rate": 2.6918850380388843e-05, + "loss": 0.4442, + "step": 5461, + "task_loss": 0.34931880235671997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37292516231536865, + "epoch": 4.62, + "learning_rate": 2.6914623837700763e-05, + "loss": 0.6299, + "step": 5462, + "task_loss": 0.9685525298118591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6986571550369263, + "epoch": 4.62, + "learning_rate": 2.691039729501268e-05, + "loss": 0.4735, + "step": 5463, + "task_loss": 1.4629547595977783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3897111415863037, + "epoch": 4.62, + "learning_rate": 2.69061707523246e-05, + "loss": 0.4564, + "step": 5464, + "task_loss": 0.5439441204071045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24074490368366241, + "epoch": 4.62, + "learning_rate": 2.6901944209636522e-05, + "loss": 0.5041, + "step": 5465, + "task_loss": 0.38880655169487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36179739236831665, + "epoch": 4.62, + "learning_rate": 2.6897717666948435e-05, + "loss": 0.5335, + "step": 5466, + "task_loss": 0.8857415318489075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6761791706085205, + "epoch": 4.62, + "learning_rate": 2.6893491124260355e-05, + "loss": 0.4016, + "step": 5467, + "task_loss": 0.8081907033920288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.654042661190033, + "epoch": 4.62, + "learning_rate": 2.6889264581572278e-05, + "loss": 0.5364, + "step": 5468, + "task_loss": 0.15242618322372437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2569422721862793, + "epoch": 4.62, + "learning_rate": 2.688503803888419e-05, + "loss": 0.4691, + "step": 5469, + "task_loss": 0.09678167849779129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45545610785484314, + "epoch": 4.62, + "learning_rate": 2.688081149619611e-05, + "loss": 0.4192, + "step": 5470, + "task_loss": 0.7507992386817932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5276072025299072, + "epoch": 4.62, + "learning_rate": 2.6876584953508034e-05, + "loss": 0.5169, + "step": 5471, + "task_loss": 0.18511104583740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.566360354423523, + "epoch": 4.63, + "learning_rate": 2.6872358410819953e-05, + "loss": 0.5259, + "step": 5472, + "task_loss": 0.8933319449424744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.195001482963562, + "epoch": 4.63, + "learning_rate": 2.686813186813187e-05, + "loss": 0.6434, + "step": 5473, + "task_loss": 0.6887006759643555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5327467918395996, + "epoch": 4.63, + "learning_rate": 2.686390532544379e-05, + "loss": 0.5658, + "step": 5474, + "task_loss": 0.6111761331558228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4131084978580475, + "epoch": 4.63, + "learning_rate": 2.685967878275571e-05, + "loss": 0.4146, + "step": 5475, + "task_loss": 0.3020959794521332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29281583428382874, + "epoch": 4.63, + "learning_rate": 2.6855452240067626e-05, + "loss": 0.6726, + "step": 5476, + "task_loss": 0.9623677730560303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42006751894950867, + "epoch": 4.63, + "learning_rate": 2.6851225697379545e-05, + "loss": 0.577, + "step": 5477, + "task_loss": 0.9024275541305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5457375645637512, + "epoch": 4.63, + "learning_rate": 2.6846999154691465e-05, + "loss": 0.6219, + "step": 5478, + "task_loss": 0.8965764045715332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.602469265460968, + "epoch": 4.63, + "learning_rate": 2.684277261200338e-05, + "loss": 0.538, + "step": 5479, + "task_loss": 0.6006090641021729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8495910167694092, + "epoch": 4.63, + "learning_rate": 2.68385460693153e-05, + "loss": 0.7306, + "step": 5480, + "task_loss": 0.9474236965179443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3784373998641968, + "epoch": 4.63, + "learning_rate": 2.683431952662722e-05, + "loss": 0.4833, + "step": 5481, + "task_loss": 0.42260465025901794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42259857058525085, + "epoch": 4.63, + "learning_rate": 2.6830092983939137e-05, + "loss": 0.359, + "step": 5482, + "task_loss": 0.29643863439559937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2174834907054901, + "epoch": 4.63, + "learning_rate": 2.6825866441251057e-05, + "loss": 0.4274, + "step": 5483, + "task_loss": 1.445667028427124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3452601730823517, + "epoch": 4.64, + "learning_rate": 2.6821639898562977e-05, + "loss": 0.4276, + "step": 5484, + "task_loss": 0.5791153907775879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5632915496826172, + "epoch": 4.64, + "learning_rate": 2.6817413355874893e-05, + "loss": 0.566, + "step": 5485, + "task_loss": 0.7125264406204224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6186407208442688, + "epoch": 4.64, + "learning_rate": 2.6813186813186813e-05, + "loss": 0.4919, + "step": 5486, + "task_loss": 0.7539494037628174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6099252700805664, + "epoch": 4.64, + "learning_rate": 2.6808960270498733e-05, + "loss": 0.5985, + "step": 5487, + "task_loss": 0.6449882388114929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2927268445491791, + "epoch": 4.64, + "learning_rate": 2.6804733727810656e-05, + "loss": 0.4786, + "step": 5488, + "task_loss": 0.7286347150802612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.480973482131958, + "epoch": 4.64, + "learning_rate": 2.680050718512257e-05, + "loss": 0.4913, + "step": 5489, + "task_loss": 0.49751967191696167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6632481813430786, + "epoch": 4.64, + "learning_rate": 2.6796280642434492e-05, + "loss": 0.4439, + "step": 5490, + "task_loss": 1.1208927631378174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4094443917274475, + "epoch": 4.64, + "learning_rate": 2.679205409974641e-05, + "loss": 0.5099, + "step": 5491, + "task_loss": 0.5087623000144958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5138392448425293, + "epoch": 4.64, + "learning_rate": 2.6787827557058324e-05, + "loss": 0.5412, + "step": 5492, + "task_loss": 0.5152201056480408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.552590012550354, + "epoch": 4.64, + "learning_rate": 2.6783601014370248e-05, + "loss": 0.5042, + "step": 5493, + "task_loss": 0.4980115294456482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.587933361530304, + "epoch": 4.64, + "learning_rate": 2.6779374471682167e-05, + "loss": 0.5931, + "step": 5494, + "task_loss": 0.7215527296066284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.607820987701416, + "epoch": 4.64, + "learning_rate": 2.6775147928994084e-05, + "loss": 0.5731, + "step": 5495, + "task_loss": 0.9025735259056091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3524215519428253, + "epoch": 4.65, + "learning_rate": 2.6770921386306003e-05, + "loss": 0.4171, + "step": 5496, + "task_loss": 1.23963463306427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3597066402435303, + "epoch": 4.65, + "learning_rate": 2.6766694843617923e-05, + "loss": 0.4827, + "step": 5497, + "task_loss": 0.9997002482414246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3568958044052124, + "epoch": 4.65, + "learning_rate": 2.676246830092984e-05, + "loss": 0.3837, + "step": 5498, + "task_loss": 1.1404331922531128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3596614599227905, + "epoch": 4.65, + "learning_rate": 2.675824175824176e-05, + "loss": 0.2812, + "step": 5499, + "task_loss": 0.29628556966781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43777722120285034, + "epoch": 4.65, + "learning_rate": 2.675401521555368e-05, + "loss": 0.4893, + "step": 5500, + "task_loss": 0.7023282051086426 + }, + { + "epoch": 4.65, + "eval_accuracy": 0.9026138613861386, + "eval_loss": 0.3561651408672333, + "eval_runtime": 228.1759, + "eval_samples_per_second": 110.66, + "eval_steps_per_second": 0.868, + "step": 5500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4523773789405823, + "epoch": 4.65, + "learning_rate": 2.67497886728656e-05, + "loss": 0.6071, + "step": 5501, + "task_loss": 0.3969096541404724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7608723044395447, + "epoch": 4.65, + "learning_rate": 2.6745562130177515e-05, + "loss": 0.547, + "step": 5502, + "task_loss": 0.8551808595657349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31090399622917175, + "epoch": 4.65, + "learning_rate": 2.6741335587489435e-05, + "loss": 0.4405, + "step": 5503, + "task_loss": 1.0763027667999268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5198166370391846, + "epoch": 4.65, + "learning_rate": 2.6737109044801355e-05, + "loss": 0.5792, + "step": 5504, + "task_loss": 0.6711432933807373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2607373297214508, + "epoch": 4.65, + "learning_rate": 2.673288250211327e-05, + "loss": 0.5798, + "step": 5505, + "task_loss": 0.37070998549461365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43698394298553467, + "epoch": 4.65, + "learning_rate": 2.672865595942519e-05, + "loss": 0.3589, + "step": 5506, + "task_loss": 0.3279370367527008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5569877624511719, + "epoch": 4.65, + "learning_rate": 2.6724429416737114e-05, + "loss": 0.499, + "step": 5507, + "task_loss": 0.5627281665802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48648685216903687, + "epoch": 4.66, + "learning_rate": 2.6720202874049027e-05, + "loss": 0.5581, + "step": 5508, + "task_loss": 1.4164979457855225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5226390361785889, + "epoch": 4.66, + "learning_rate": 2.6715976331360946e-05, + "loss": 0.5408, + "step": 5509, + "task_loss": 0.7491970658302307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3967892825603485, + "epoch": 4.66, + "learning_rate": 2.671174978867287e-05, + "loss": 0.4699, + "step": 5510, + "task_loss": 0.6714504361152649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5288428664207458, + "epoch": 4.66, + "learning_rate": 2.6707523245984783e-05, + "loss": 0.5784, + "step": 5511, + "task_loss": 0.5653752684593201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2977157235145569, + "epoch": 4.66, + "learning_rate": 2.6703296703296706e-05, + "loss": 0.5335, + "step": 5512, + "task_loss": 0.02223970927298069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6852577328681946, + "epoch": 4.66, + "learning_rate": 2.6699070160608625e-05, + "loss": 0.5361, + "step": 5513, + "task_loss": 0.4608360826969147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4037739932537079, + "epoch": 4.66, + "learning_rate": 2.669484361792054e-05, + "loss": 0.4604, + "step": 5514, + "task_loss": 0.48674654960632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.317607581615448, + "epoch": 4.66, + "learning_rate": 2.669061707523246e-05, + "loss": 0.52, + "step": 5515, + "task_loss": 0.46464473009109497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27266260981559753, + "epoch": 4.66, + "learning_rate": 2.668639053254438e-05, + "loss": 0.4148, + "step": 5516, + "task_loss": 0.6932034492492676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4652734398841858, + "epoch": 4.66, + "learning_rate": 2.66821639898563e-05, + "loss": 0.5323, + "step": 5517, + "task_loss": 0.5104216933250427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7152064442634583, + "epoch": 4.66, + "learning_rate": 2.6677937447168217e-05, + "loss": 0.494, + "step": 5518, + "task_loss": 0.8032392263412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6222490072250366, + "epoch": 4.66, + "learning_rate": 2.6673710904480137e-05, + "loss": 0.6388, + "step": 5519, + "task_loss": 0.26591697335243225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39413371682167053, + "epoch": 4.67, + "learning_rate": 2.6669484361792057e-05, + "loss": 0.4606, + "step": 5520, + "task_loss": 0.651878297328949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32757729291915894, + "epoch": 4.67, + "learning_rate": 2.6665257819103973e-05, + "loss": 0.3725, + "step": 5521, + "task_loss": 0.1749047487974167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41070133447647095, + "epoch": 4.67, + "learning_rate": 2.6661031276415893e-05, + "loss": 0.4867, + "step": 5522, + "task_loss": 0.613288164138794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4388280212879181, + "epoch": 4.67, + "learning_rate": 2.6656804733727813e-05, + "loss": 0.5016, + "step": 5523, + "task_loss": 0.6644706130027771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8439745903015137, + "epoch": 4.67, + "learning_rate": 2.665257819103973e-05, + "loss": 0.6902, + "step": 5524, + "task_loss": 0.130574032664299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3907517194747925, + "epoch": 4.67, + "learning_rate": 2.664835164835165e-05, + "loss": 0.5417, + "step": 5525, + "task_loss": 0.33565089106559753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41575074195861816, + "epoch": 4.67, + "learning_rate": 2.664412510566357e-05, + "loss": 0.5337, + "step": 5526, + "task_loss": 0.46007034182548523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6557846069335938, + "epoch": 4.67, + "learning_rate": 2.6639898562975485e-05, + "loss": 0.5275, + "step": 5527, + "task_loss": 1.5052037239074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5499823689460754, + "epoch": 4.67, + "learning_rate": 2.6635672020287405e-05, + "loss": 0.4302, + "step": 5528, + "task_loss": 0.8446851968765259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2966265082359314, + "epoch": 4.67, + "learning_rate": 2.6631445477599328e-05, + "loss": 0.5629, + "step": 5529, + "task_loss": 0.1258809119462967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5064107179641724, + "epoch": 4.67, + "learning_rate": 2.6627218934911247e-05, + "loss": 0.6413, + "step": 5530, + "task_loss": 0.6399808526039124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5774915218353271, + "epoch": 4.67, + "learning_rate": 2.662299239222316e-05, + "loss": 0.55, + "step": 5531, + "task_loss": 0.8530387878417969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5741387605667114, + "epoch": 4.68, + "learning_rate": 2.6618765849535084e-05, + "loss": 0.5026, + "step": 5532, + "task_loss": 0.8792498707771301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5797810554504395, + "epoch": 4.68, + "learning_rate": 2.6614539306847003e-05, + "loss": 0.4534, + "step": 5533, + "task_loss": 0.06427767872810364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35390692949295044, + "epoch": 4.68, + "learning_rate": 2.6610312764158916e-05, + "loss": 0.4629, + "step": 5534, + "task_loss": 0.5747318267822266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22252270579338074, + "epoch": 4.68, + "learning_rate": 2.660608622147084e-05, + "loss": 0.5484, + "step": 5535, + "task_loss": 0.3115277588367462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35737746953964233, + "epoch": 4.68, + "learning_rate": 2.660185967878276e-05, + "loss": 0.4069, + "step": 5536, + "task_loss": 1.1496926546096802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7334063053131104, + "epoch": 4.68, + "learning_rate": 2.6597633136094675e-05, + "loss": 0.426, + "step": 5537, + "task_loss": 0.9971699714660645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.433099627494812, + "epoch": 4.68, + "learning_rate": 2.6593406593406595e-05, + "loss": 0.4768, + "step": 5538, + "task_loss": 0.44010573625564575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35387203097343445, + "epoch": 4.68, + "learning_rate": 2.6589180050718515e-05, + "loss": 0.5987, + "step": 5539, + "task_loss": 1.3987623453140259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4783155918121338, + "epoch": 4.68, + "learning_rate": 2.658495350803043e-05, + "loss": 0.7451, + "step": 5540, + "task_loss": 0.46355360746383667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32887375354766846, + "epoch": 4.68, + "learning_rate": 2.658072696534235e-05, + "loss": 0.3279, + "step": 5541, + "task_loss": 0.37576723098754883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2954345941543579, + "epoch": 4.68, + "learning_rate": 2.657650042265427e-05, + "loss": 0.5132, + "step": 5542, + "task_loss": 1.1704410314559937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38812386989593506, + "epoch": 4.69, + "learning_rate": 2.6572273879966187e-05, + "loss": 0.5396, + "step": 5543, + "task_loss": 1.120713233947754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36863604187965393, + "epoch": 4.69, + "learning_rate": 2.6568047337278107e-05, + "loss": 0.5563, + "step": 5544, + "task_loss": 0.40240323543548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2079882025718689, + "epoch": 4.69, + "learning_rate": 2.6563820794590027e-05, + "loss": 0.2734, + "step": 5545, + "task_loss": 0.45972466468811035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5143852233886719, + "epoch": 4.69, + "learning_rate": 2.655959425190195e-05, + "loss": 0.5082, + "step": 5546, + "task_loss": 0.5054423213005066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3416542410850525, + "epoch": 4.69, + "learning_rate": 2.6555367709213863e-05, + "loss": 0.3293, + "step": 5547, + "task_loss": 0.4533216953277588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4784584939479828, + "epoch": 4.69, + "learning_rate": 2.6551141166525782e-05, + "loss": 0.646, + "step": 5548, + "task_loss": 1.083616018295288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29557737708091736, + "epoch": 4.69, + "learning_rate": 2.6546914623837706e-05, + "loss": 0.3529, + "step": 5549, + "task_loss": 0.8247396945953369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.933720588684082, + "epoch": 4.69, + "learning_rate": 2.654268808114962e-05, + "loss": 0.7319, + "step": 5550, + "task_loss": 0.4455767273902893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43065202236175537, + "epoch": 4.69, + "learning_rate": 2.6538461538461538e-05, + "loss": 0.4658, + "step": 5551, + "task_loss": 0.6591951847076416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5519582033157349, + "epoch": 4.69, + "learning_rate": 2.653423499577346e-05, + "loss": 0.4613, + "step": 5552, + "task_loss": 0.8736343383789062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42683807015419006, + "epoch": 4.69, + "learning_rate": 2.6530008453085374e-05, + "loss": 0.5368, + "step": 5553, + "task_loss": 1.297966718673706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41083237528800964, + "epoch": 4.69, + "learning_rate": 2.6525781910397297e-05, + "loss": 0.5072, + "step": 5554, + "task_loss": 0.28636035323143005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5030954480171204, + "epoch": 4.7, + "learning_rate": 2.6521555367709217e-05, + "loss": 0.4919, + "step": 5555, + "task_loss": 0.4310172200202942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.513940691947937, + "epoch": 4.7, + "learning_rate": 2.651732882502113e-05, + "loss": 0.5688, + "step": 5556, + "task_loss": 0.609089195728302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4849684536457062, + "epoch": 4.7, + "learning_rate": 2.6513102282333053e-05, + "loss": 0.5868, + "step": 5557, + "task_loss": 0.34574028849601746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30403852462768555, + "epoch": 4.7, + "learning_rate": 2.6508875739644973e-05, + "loss": 0.5085, + "step": 5558, + "task_loss": 0.4700937569141388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4608495831489563, + "epoch": 4.7, + "learning_rate": 2.6504649196956893e-05, + "loss": 0.4697, + "step": 5559, + "task_loss": 0.6089870929718018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38940346240997314, + "epoch": 4.7, + "learning_rate": 2.650042265426881e-05, + "loss": 0.5475, + "step": 5560, + "task_loss": 0.7848317623138428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37892967462539673, + "epoch": 4.7, + "learning_rate": 2.649619611158073e-05, + "loss": 0.4604, + "step": 5561, + "task_loss": 0.572037935256958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7236365079879761, + "epoch": 4.7, + "learning_rate": 2.649196956889265e-05, + "loss": 0.6131, + "step": 5562, + "task_loss": 0.6075588464736938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5459661483764648, + "epoch": 4.7, + "learning_rate": 2.6487743026204565e-05, + "loss": 0.5209, + "step": 5563, + "task_loss": 0.5323628783226013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5334126353263855, + "epoch": 4.7, + "learning_rate": 2.6483516483516485e-05, + "loss": 0.5249, + "step": 5564, + "task_loss": 0.1525871753692627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3653854727745056, + "epoch": 4.7, + "learning_rate": 2.6479289940828404e-05, + "loss": 0.5442, + "step": 5565, + "task_loss": 0.5842914581298828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37355881929397583, + "epoch": 4.7, + "learning_rate": 2.647506339814032e-05, + "loss": 0.6531, + "step": 5566, + "task_loss": 0.5938406586647034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5712457895278931, + "epoch": 4.71, + "learning_rate": 2.647083685545224e-05, + "loss": 0.4361, + "step": 5567, + "task_loss": 0.5165548324584961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7099549770355225, + "epoch": 4.71, + "learning_rate": 2.646661031276416e-05, + "loss": 0.6888, + "step": 5568, + "task_loss": 0.23085536062717438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6301600337028503, + "epoch": 4.71, + "learning_rate": 2.6462383770076077e-05, + "loss": 0.6003, + "step": 5569, + "task_loss": 0.5380938649177551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4351697564125061, + "epoch": 4.71, + "learning_rate": 2.6458157227387996e-05, + "loss": 0.4229, + "step": 5570, + "task_loss": 0.7270271182060242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.771167516708374, + "epoch": 4.71, + "learning_rate": 2.645393068469992e-05, + "loss": 0.6589, + "step": 5571, + "task_loss": 0.7263500094413757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8471298217773438, + "epoch": 4.71, + "learning_rate": 2.6449704142011832e-05, + "loss": 0.6533, + "step": 5572, + "task_loss": 1.0060794353485107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6487619280815125, + "epoch": 4.71, + "learning_rate": 2.6445477599323752e-05, + "loss": 0.5162, + "step": 5573, + "task_loss": 0.6963391304016113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6797330975532532, + "epoch": 4.71, + "learning_rate": 2.6441251056635675e-05, + "loss": 0.4536, + "step": 5574, + "task_loss": 0.780808687210083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5461999177932739, + "epoch": 4.71, + "learning_rate": 2.6437024513947595e-05, + "loss": 0.5778, + "step": 5575, + "task_loss": 1.1536657810211182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5113688707351685, + "epoch": 4.71, + "learning_rate": 2.643279797125951e-05, + "loss": 0.5099, + "step": 5576, + "task_loss": 0.2360696941614151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4419170320034027, + "epoch": 4.71, + "learning_rate": 2.642857142857143e-05, + "loss": 0.5226, + "step": 5577, + "task_loss": 0.824263870716095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5355842709541321, + "epoch": 4.71, + "learning_rate": 2.642434488588335e-05, + "loss": 0.5348, + "step": 5578, + "task_loss": 0.70827716588974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8563870787620544, + "epoch": 4.72, + "learning_rate": 2.6420118343195267e-05, + "loss": 0.6513, + "step": 5579, + "task_loss": 1.5571237802505493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36583828926086426, + "epoch": 4.72, + "learning_rate": 2.6415891800507187e-05, + "loss": 0.5045, + "step": 5580, + "task_loss": 0.10022090375423431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2370641678571701, + "epoch": 4.72, + "learning_rate": 2.6411665257819107e-05, + "loss": 0.5292, + "step": 5581, + "task_loss": 0.14611811935901642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3379738926887512, + "epoch": 4.72, + "learning_rate": 2.6407438715131023e-05, + "loss": 0.4644, + "step": 5582, + "task_loss": 0.4610328674316406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7449734807014465, + "epoch": 4.72, + "learning_rate": 2.6403212172442943e-05, + "loss": 0.7266, + "step": 5583, + "task_loss": 0.6059086918830872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.518571674823761, + "epoch": 4.72, + "learning_rate": 2.6398985629754862e-05, + "loss": 0.4909, + "step": 5584, + "task_loss": 1.381953239440918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.572045087814331, + "epoch": 4.72, + "learning_rate": 2.639475908706678e-05, + "loss": 0.4748, + "step": 5585, + "task_loss": 1.100764274597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49839168787002563, + "epoch": 4.72, + "learning_rate": 2.63905325443787e-05, + "loss": 0.5934, + "step": 5586, + "task_loss": 0.5220285058021545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3297421932220459, + "epoch": 4.72, + "learning_rate": 2.638630600169062e-05, + "loss": 0.5343, + "step": 5587, + "task_loss": 0.38556134700775146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45048609375953674, + "epoch": 4.72, + "learning_rate": 2.6382079459002535e-05, + "loss": 0.5049, + "step": 5588, + "task_loss": 1.2399929761886597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5687627196311951, + "epoch": 4.72, + "learning_rate": 2.6377852916314454e-05, + "loss": 0.5119, + "step": 5589, + "task_loss": 0.4569653272628784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3494449853897095, + "epoch": 4.72, + "learning_rate": 2.6373626373626374e-05, + "loss": 0.3949, + "step": 5590, + "task_loss": 0.20925568044185638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5153350234031677, + "epoch": 4.73, + "learning_rate": 2.6369399830938297e-05, + "loss": 0.5013, + "step": 5591, + "task_loss": 1.0863345861434937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38383543491363525, + "epoch": 4.73, + "learning_rate": 2.636517328825021e-05, + "loss": 0.4107, + "step": 5592, + "task_loss": 0.7033401727676392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5520209670066833, + "epoch": 4.73, + "learning_rate": 2.6360946745562133e-05, + "loss": 0.4617, + "step": 5593, + "task_loss": 1.207166314125061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5075170993804932, + "epoch": 4.73, + "learning_rate": 2.6356720202874053e-05, + "loss": 0.6318, + "step": 5594, + "task_loss": 0.9788430333137512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41028743982315063, + "epoch": 4.73, + "learning_rate": 2.6352493660185966e-05, + "loss": 0.5641, + "step": 5595, + "task_loss": 0.4345182478427887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4060463309288025, + "epoch": 4.73, + "learning_rate": 2.634826711749789e-05, + "loss": 0.5916, + "step": 5596, + "task_loss": 0.47377336025238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5743918418884277, + "epoch": 4.73, + "learning_rate": 2.634404057480981e-05, + "loss": 0.551, + "step": 5597, + "task_loss": 1.3714823722839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7340099811553955, + "epoch": 4.73, + "learning_rate": 2.6339814032121722e-05, + "loss": 0.5147, + "step": 5598, + "task_loss": 0.7502601742744446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5614179968833923, + "epoch": 4.73, + "learning_rate": 2.6335587489433645e-05, + "loss": 0.4893, + "step": 5599, + "task_loss": 0.637708842754364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5504275560379028, + "epoch": 4.73, + "learning_rate": 2.6331360946745565e-05, + "loss": 0.7383, + "step": 5600, + "task_loss": 0.8209319114685059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37135112285614014, + "epoch": 4.73, + "learning_rate": 2.632713440405748e-05, + "loss": 0.3424, + "step": 5601, + "task_loss": 0.453311026096344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3217511475086212, + "epoch": 4.73, + "learning_rate": 2.63229078613694e-05, + "loss": 0.4112, + "step": 5602, + "task_loss": 1.0388603210449219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6180079579353333, + "epoch": 4.74, + "learning_rate": 2.631868131868132e-05, + "loss": 0.5138, + "step": 5603, + "task_loss": 0.09587042033672333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36335116624832153, + "epoch": 4.74, + "learning_rate": 2.631445477599324e-05, + "loss": 0.4446, + "step": 5604, + "task_loss": 0.30712229013442993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6665881872177124, + "epoch": 4.74, + "learning_rate": 2.6310228233305157e-05, + "loss": 0.5388, + "step": 5605, + "task_loss": 1.4466938972473145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5067758560180664, + "epoch": 4.74, + "learning_rate": 2.6306001690617076e-05, + "loss": 0.6159, + "step": 5606, + "task_loss": 0.5833654999732971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6314814686775208, + "epoch": 4.74, + "learning_rate": 2.6301775147928996e-05, + "loss": 0.606, + "step": 5607, + "task_loss": 0.12786057591438293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.675163745880127, + "epoch": 4.74, + "learning_rate": 2.6297548605240912e-05, + "loss": 0.5101, + "step": 5608, + "task_loss": 1.256523609161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45086413621902466, + "epoch": 4.74, + "learning_rate": 2.6293322062552832e-05, + "loss": 0.6938, + "step": 5609, + "task_loss": 0.2894653081893921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6532406806945801, + "epoch": 4.74, + "learning_rate": 2.6289095519864755e-05, + "loss": 0.5424, + "step": 5610, + "task_loss": 1.1959331035614014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5219150185585022, + "epoch": 4.74, + "learning_rate": 2.628486897717667e-05, + "loss": 0.5459, + "step": 5611, + "task_loss": 0.8497066497802734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7167882919311523, + "epoch": 4.74, + "learning_rate": 2.6280642434488588e-05, + "loss": 0.5655, + "step": 5612, + "task_loss": 1.190349817276001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4336186349391937, + "epoch": 4.74, + "learning_rate": 2.627641589180051e-05, + "loss": 0.3743, + "step": 5613, + "task_loss": 0.39047151803970337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5110586285591125, + "epoch": 4.75, + "learning_rate": 2.6272189349112424e-05, + "loss": 0.5845, + "step": 5614, + "task_loss": 0.6457735896110535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5020008683204651, + "epoch": 4.75, + "learning_rate": 2.6267962806424344e-05, + "loss": 0.5204, + "step": 5615, + "task_loss": 1.2768497467041016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8477251529693604, + "epoch": 4.75, + "learning_rate": 2.6263736263736267e-05, + "loss": 0.5867, + "step": 5616, + "task_loss": 1.039192795753479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5431690216064453, + "epoch": 4.75, + "learning_rate": 2.625950972104818e-05, + "loss": 0.625, + "step": 5617, + "task_loss": 1.0950745344161987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.363947331905365, + "epoch": 4.75, + "learning_rate": 2.6255283178360103e-05, + "loss": 0.409, + "step": 5618, + "task_loss": 0.39402270317077637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40578022599220276, + "epoch": 4.75, + "learning_rate": 2.6251056635672023e-05, + "loss": 0.6084, + "step": 5619, + "task_loss": 1.3617318868637085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6474149823188782, + "epoch": 4.75, + "learning_rate": 2.6246830092983943e-05, + "loss": 0.4541, + "step": 5620, + "task_loss": 0.48807433247566223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5947340130805969, + "epoch": 4.75, + "learning_rate": 2.624260355029586e-05, + "loss": 0.5067, + "step": 5621, + "task_loss": 0.968641459941864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0875134468078613, + "epoch": 4.75, + "learning_rate": 2.623837700760778e-05, + "loss": 0.6976, + "step": 5622, + "task_loss": 0.7974452972412109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5867829322814941, + "epoch": 4.75, + "learning_rate": 2.62341504649197e-05, + "loss": 0.4656, + "step": 5623, + "task_loss": 0.5273624062538147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4523177742958069, + "epoch": 4.75, + "learning_rate": 2.6229923922231615e-05, + "loss": 0.5299, + "step": 5624, + "task_loss": 0.8055267930030823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37007904052734375, + "epoch": 4.75, + "learning_rate": 2.6225697379543535e-05, + "loss": 0.5727, + "step": 5625, + "task_loss": 0.7688056230545044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6906920671463013, + "epoch": 4.76, + "learning_rate": 2.6221470836855454e-05, + "loss": 0.5354, + "step": 5626, + "task_loss": 1.0719765424728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42443329095840454, + "epoch": 4.76, + "learning_rate": 2.621724429416737e-05, + "loss": 0.6551, + "step": 5627, + "task_loss": 0.6896079182624817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4050268530845642, + "epoch": 4.76, + "learning_rate": 2.621301775147929e-05, + "loss": 0.3016, + "step": 5628, + "task_loss": 0.46010303497314453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.780997097492218, + "epoch": 4.76, + "learning_rate": 2.620879120879121e-05, + "loss": 0.7151, + "step": 5629, + "task_loss": 0.807028591632843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5751667022705078, + "epoch": 4.76, + "learning_rate": 2.6204564666103126e-05, + "loss": 0.5585, + "step": 5630, + "task_loss": 0.7727732062339783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6906204223632812, + "epoch": 4.76, + "learning_rate": 2.6200338123415046e-05, + "loss": 0.6195, + "step": 5631, + "task_loss": 0.48975932598114014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4904405474662781, + "epoch": 4.76, + "learning_rate": 2.6196111580726966e-05, + "loss": 0.5136, + "step": 5632, + "task_loss": 0.41775432229042053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31652069091796875, + "epoch": 4.76, + "learning_rate": 2.619188503803889e-05, + "loss": 0.352, + "step": 5633, + "task_loss": 0.2876855432987213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5669068098068237, + "epoch": 4.76, + "learning_rate": 2.6187658495350802e-05, + "loss": 0.5618, + "step": 5634, + "task_loss": 1.006703495979309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3489764928817749, + "epoch": 4.76, + "learning_rate": 2.6183431952662725e-05, + "loss": 0.4251, + "step": 5635, + "task_loss": 0.7292515635490417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4997534155845642, + "epoch": 4.76, + "learning_rate": 2.6179205409974645e-05, + "loss": 0.5169, + "step": 5636, + "task_loss": 0.5023015737533569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9334338307380676, + "epoch": 4.76, + "learning_rate": 2.6174978867286558e-05, + "loss": 0.6109, + "step": 5637, + "task_loss": 0.6112658381462097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4933055639266968, + "epoch": 4.77, + "learning_rate": 2.617075232459848e-05, + "loss": 0.6851, + "step": 5638, + "task_loss": 0.6476717591285706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4700254201889038, + "epoch": 4.77, + "learning_rate": 2.61665257819104e-05, + "loss": 0.4386, + "step": 5639, + "task_loss": 0.898995041847229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6200739741325378, + "epoch": 4.77, + "learning_rate": 2.6162299239222317e-05, + "loss": 0.5285, + "step": 5640, + "task_loss": 1.8384853601455688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.626653254032135, + "epoch": 4.77, + "learning_rate": 2.6158072696534237e-05, + "loss": 0.6014, + "step": 5641, + "task_loss": 0.7722612023353577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20563089847564697, + "epoch": 4.77, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.5374, + "step": 5642, + "task_loss": 0.4498206377029419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7185769081115723, + "epoch": 4.77, + "learning_rate": 2.6149619611158073e-05, + "loss": 0.602, + "step": 5643, + "task_loss": 0.7585304975509644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2892688512802124, + "epoch": 4.77, + "learning_rate": 2.6145393068469993e-05, + "loss": 0.5533, + "step": 5644, + "task_loss": 0.6774147748947144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5541114807128906, + "epoch": 4.77, + "learning_rate": 2.6141166525781912e-05, + "loss": 0.4698, + "step": 5645, + "task_loss": 1.1297212839126587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28559190034866333, + "epoch": 4.77, + "learning_rate": 2.613693998309383e-05, + "loss": 0.4665, + "step": 5646, + "task_loss": 0.3137655556201935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36219853162765503, + "epoch": 4.77, + "learning_rate": 2.613271344040575e-05, + "loss": 0.6662, + "step": 5647, + "task_loss": 0.839945912361145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5286502242088318, + "epoch": 4.77, + "learning_rate": 2.6128486897717668e-05, + "loss": 0.5826, + "step": 5648, + "task_loss": 0.5320906043052673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36559855937957764, + "epoch": 4.77, + "learning_rate": 2.6124260355029588e-05, + "loss": 0.5939, + "step": 5649, + "task_loss": 0.5208147764205933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40268534421920776, + "epoch": 4.78, + "learning_rate": 2.6120033812341504e-05, + "loss": 0.485, + "step": 5650, + "task_loss": 0.2845090627670288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2513153553009033, + "epoch": 4.78, + "learning_rate": 2.6115807269653424e-05, + "loss": 0.443, + "step": 5651, + "task_loss": 0.1188097819685936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23720800876617432, + "epoch": 4.78, + "learning_rate": 2.6111580726965347e-05, + "loss": 0.397, + "step": 5652, + "task_loss": 1.0551912784576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2911929488182068, + "epoch": 4.78, + "learning_rate": 2.610735418427726e-05, + "loss": 0.4006, + "step": 5653, + "task_loss": 0.6315035223960876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43509918451309204, + "epoch": 4.78, + "learning_rate": 2.610312764158918e-05, + "loss": 0.4368, + "step": 5654, + "task_loss": 0.22782652080059052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7625494003295898, + "epoch": 4.78, + "learning_rate": 2.6098901098901103e-05, + "loss": 0.6781, + "step": 5655, + "task_loss": 1.1475688219070435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3548119068145752, + "epoch": 4.78, + "learning_rate": 2.6094674556213016e-05, + "loss": 0.5091, + "step": 5656, + "task_loss": 1.0246869325637817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4572964012622833, + "epoch": 4.78, + "learning_rate": 2.609044801352494e-05, + "loss": 0.6032, + "step": 5657, + "task_loss": 0.47745344042778015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6380445957183838, + "epoch": 4.78, + "learning_rate": 2.608622147083686e-05, + "loss": 0.6741, + "step": 5658, + "task_loss": 1.09572434425354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8480476140975952, + "epoch": 4.78, + "learning_rate": 2.6081994928148772e-05, + "loss": 0.5767, + "step": 5659, + "task_loss": 0.14236420392990112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5163456201553345, + "epoch": 4.78, + "learning_rate": 2.6077768385460695e-05, + "loss": 0.5335, + "step": 5660, + "task_loss": 0.6128365993499756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2095562219619751, + "epoch": 4.78, + "learning_rate": 2.6073541842772615e-05, + "loss": 0.5052, + "step": 5661, + "task_loss": 0.4599507451057434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5721461772918701, + "epoch": 4.79, + "learning_rate": 2.6069315300084534e-05, + "loss": 0.5359, + "step": 5662, + "task_loss": 0.14122001826763153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.548396110534668, + "epoch": 4.79, + "learning_rate": 2.606508875739645e-05, + "loss": 0.4802, + "step": 5663, + "task_loss": 0.3679597079753876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6143293976783752, + "epoch": 4.79, + "learning_rate": 2.606086221470837e-05, + "loss": 0.5078, + "step": 5664, + "task_loss": 0.9742221832275391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3170853555202484, + "epoch": 4.79, + "learning_rate": 2.605663567202029e-05, + "loss": 0.4282, + "step": 5665, + "task_loss": 0.6404563188552856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7089065909385681, + "epoch": 4.79, + "learning_rate": 2.6052409129332207e-05, + "loss": 0.6001, + "step": 5666, + "task_loss": 0.19910073280334473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3767755329608917, + "epoch": 4.79, + "learning_rate": 2.6048182586644126e-05, + "loss": 0.4552, + "step": 5667, + "task_loss": 0.6200957298278809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43517014384269714, + "epoch": 4.79, + "learning_rate": 2.6043956043956046e-05, + "loss": 0.4715, + "step": 5668, + "task_loss": 0.3498627543449402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45215046405792236, + "epoch": 4.79, + "learning_rate": 2.6039729501267962e-05, + "loss": 0.4349, + "step": 5669, + "task_loss": 0.1911943554878235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.287114679813385, + "epoch": 4.79, + "learning_rate": 2.6035502958579882e-05, + "loss": 0.5109, + "step": 5670, + "task_loss": 0.7816593050956726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.960198163986206, + "epoch": 4.79, + "learning_rate": 2.6031276415891802e-05, + "loss": 0.844, + "step": 5671, + "task_loss": 1.0055745840072632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.74333655834198, + "epoch": 4.79, + "learning_rate": 2.6027049873203718e-05, + "loss": 0.5266, + "step": 5672, + "task_loss": 0.44717124104499817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37837931513786316, + "epoch": 4.79, + "learning_rate": 2.6022823330515638e-05, + "loss": 0.5604, + "step": 5673, + "task_loss": 0.9327839016914368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35657021403312683, + "epoch": 4.8, + "learning_rate": 2.601859678782756e-05, + "loss": 0.6562, + "step": 5674, + "task_loss": 0.4927375912666321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44908827543258667, + "epoch": 4.8, + "learning_rate": 2.6014370245139474e-05, + "loss": 0.5025, + "step": 5675, + "task_loss": 0.5313500165939331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24783718585968018, + "epoch": 4.8, + "learning_rate": 2.6010143702451394e-05, + "loss": 0.4692, + "step": 5676, + "task_loss": 0.2493368536233902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6368770003318787, + "epoch": 4.8, + "learning_rate": 2.6005917159763317e-05, + "loss": 0.4334, + "step": 5677, + "task_loss": 0.7560250759124756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.587492823600769, + "epoch": 4.8, + "learning_rate": 2.6001690617075237e-05, + "loss": 0.6201, + "step": 5678, + "task_loss": 0.7745904326438904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48275214433670044, + "epoch": 4.8, + "learning_rate": 2.599746407438715e-05, + "loss": 0.4779, + "step": 5679, + "task_loss": 1.784326434135437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.526221513748169, + "epoch": 4.8, + "learning_rate": 2.5993237531699073e-05, + "loss": 0.5545, + "step": 5680, + "task_loss": 0.9344443082809448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.525942325592041, + "epoch": 4.8, + "learning_rate": 2.5989010989010992e-05, + "loss": 0.5906, + "step": 5681, + "task_loss": 0.7862739562988281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5827715396881104, + "epoch": 4.8, + "learning_rate": 2.598478444632291e-05, + "loss": 0.6131, + "step": 5682, + "task_loss": 0.4872420132160187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24403181672096252, + "epoch": 4.8, + "learning_rate": 2.598055790363483e-05, + "loss": 0.41, + "step": 5683, + "task_loss": 0.06736335903406143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.913641631603241, + "epoch": 4.8, + "learning_rate": 2.5976331360946748e-05, + "loss": 0.6992, + "step": 5684, + "task_loss": 0.8783570528030396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30338025093078613, + "epoch": 4.81, + "learning_rate": 2.5972104818258665e-05, + "loss": 0.4368, + "step": 5685, + "task_loss": 0.7445864677429199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6875220537185669, + "epoch": 4.81, + "learning_rate": 2.5967878275570584e-05, + "loss": 0.5046, + "step": 5686, + "task_loss": 1.198779582977295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3079240322113037, + "epoch": 4.81, + "learning_rate": 2.5963651732882504e-05, + "loss": 0.4434, + "step": 5687, + "task_loss": 0.24542582035064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5229561924934387, + "epoch": 4.81, + "learning_rate": 2.595942519019442e-05, + "loss": 0.5732, + "step": 5688, + "task_loss": 0.464175820350647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7783998250961304, + "epoch": 4.81, + "learning_rate": 2.595519864750634e-05, + "loss": 0.5074, + "step": 5689, + "task_loss": 0.7924084663391113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3210090398788452, + "epoch": 4.81, + "learning_rate": 2.595097210481826e-05, + "loss": 0.3673, + "step": 5690, + "task_loss": 0.5120891332626343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4437573552131653, + "epoch": 4.81, + "learning_rate": 2.5946745562130183e-05, + "loss": 0.4692, + "step": 5691, + "task_loss": 1.708738088607788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.434333860874176, + "epoch": 4.81, + "learning_rate": 2.5942519019442096e-05, + "loss": 0.4943, + "step": 5692, + "task_loss": 1.3493313789367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8483754396438599, + "epoch": 4.81, + "learning_rate": 2.5938292476754016e-05, + "loss": 0.7299, + "step": 5693, + "task_loss": 1.4283978939056396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5169681310653687, + "epoch": 4.81, + "learning_rate": 2.593406593406594e-05, + "loss": 0.575, + "step": 5694, + "task_loss": 0.705615222454071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.646710991859436, + "epoch": 4.81, + "learning_rate": 2.5929839391377852e-05, + "loss": 0.5696, + "step": 5695, + "task_loss": 0.9948090314865112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6141926050186157, + "epoch": 4.81, + "learning_rate": 2.592561284868977e-05, + "loss": 0.5006, + "step": 5696, + "task_loss": 0.6821855306625366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7578493356704712, + "epoch": 4.82, + "learning_rate": 2.5921386306001695e-05, + "loss": 0.5585, + "step": 5697, + "task_loss": 0.8133834600448608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30476129055023193, + "epoch": 4.82, + "learning_rate": 2.5917159763313608e-05, + "loss": 0.4544, + "step": 5698, + "task_loss": 1.0011470317840576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24370303750038147, + "epoch": 4.82, + "learning_rate": 2.591293322062553e-05, + "loss": 0.5116, + "step": 5699, + "task_loss": 0.9572376608848572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40835651755332947, + "epoch": 4.82, + "learning_rate": 2.590870667793745e-05, + "loss": 0.3912, + "step": 5700, + "task_loss": 0.5730369687080383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5113409757614136, + "epoch": 4.82, + "learning_rate": 2.5904480135249363e-05, + "loss": 0.4246, + "step": 5701, + "task_loss": 0.2606313228607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4817047715187073, + "epoch": 4.82, + "learning_rate": 2.5900253592561287e-05, + "loss": 0.5348, + "step": 5702, + "task_loss": 0.3780542016029358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4873103201389313, + "epoch": 4.82, + "learning_rate": 2.5896027049873206e-05, + "loss": 0.4629, + "step": 5703, + "task_loss": 0.9705052971839905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0966441631317139, + "epoch": 4.82, + "learning_rate": 2.5891800507185123e-05, + "loss": 0.5141, + "step": 5704, + "task_loss": 0.5987086892127991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7557412981987, + "epoch": 4.82, + "learning_rate": 2.5887573964497042e-05, + "loss": 0.4665, + "step": 5705, + "task_loss": 0.24284757673740387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5400998592376709, + "epoch": 4.82, + "learning_rate": 2.5883347421808962e-05, + "loss": 0.607, + "step": 5706, + "task_loss": 0.4309907555580139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5115411877632141, + "epoch": 4.82, + "learning_rate": 2.5879120879120882e-05, + "loss": 0.771, + "step": 5707, + "task_loss": 0.6689947843551636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5601620674133301, + "epoch": 4.82, + "learning_rate": 2.5874894336432798e-05, + "loss": 0.5574, + "step": 5708, + "task_loss": 0.7265559434890747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7620468735694885, + "epoch": 4.83, + "learning_rate": 2.5870667793744718e-05, + "loss": 0.637, + "step": 5709, + "task_loss": 0.2640365660190582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42057105898857117, + "epoch": 4.83, + "learning_rate": 2.5866441251056638e-05, + "loss": 0.5882, + "step": 5710, + "task_loss": 0.6982191205024719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6694300174713135, + "epoch": 4.83, + "learning_rate": 2.5862214708368554e-05, + "loss": 0.5296, + "step": 5711, + "task_loss": 0.5387284159660339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7289752960205078, + "epoch": 4.83, + "learning_rate": 2.5857988165680474e-05, + "loss": 0.5318, + "step": 5712, + "task_loss": 0.16338476538658142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5404638051986694, + "epoch": 4.83, + "learning_rate": 2.5853761622992394e-05, + "loss": 0.6265, + "step": 5713, + "task_loss": 0.7374786138534546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5856848955154419, + "epoch": 4.83, + "learning_rate": 2.584953508030431e-05, + "loss": 0.4605, + "step": 5714, + "task_loss": 0.33530861139297485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8572466373443604, + "epoch": 4.83, + "learning_rate": 2.584530853761623e-05, + "loss": 0.5386, + "step": 5715, + "task_loss": 0.5358402729034424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4709377884864807, + "epoch": 4.83, + "learning_rate": 2.5841081994928153e-05, + "loss": 0.5637, + "step": 5716, + "task_loss": 0.35859978199005127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40097755193710327, + "epoch": 4.83, + "learning_rate": 2.5836855452240066e-05, + "loss": 0.6534, + "step": 5717, + "task_loss": 0.48819518089294434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15693235397338867, + "epoch": 4.83, + "learning_rate": 2.5832628909551985e-05, + "loss": 0.5329, + "step": 5718, + "task_loss": 0.3870006203651428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.611301600933075, + "epoch": 4.83, + "learning_rate": 2.582840236686391e-05, + "loss": 0.5204, + "step": 5719, + "task_loss": 0.3511592149734497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38320550322532654, + "epoch": 4.83, + "learning_rate": 2.582417582417583e-05, + "loss": 0.4812, + "step": 5720, + "task_loss": 0.5833104848861694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4239521622657776, + "epoch": 4.84, + "learning_rate": 2.5819949281487745e-05, + "loss": 0.4865, + "step": 5721, + "task_loss": 1.3155471086502075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.374227374792099, + "epoch": 4.84, + "learning_rate": 2.5815722738799664e-05, + "loss": 0.4848, + "step": 5722, + "task_loss": 0.155734583735466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3500721752643585, + "epoch": 4.84, + "learning_rate": 2.5811496196111584e-05, + "loss": 0.3175, + "step": 5723, + "task_loss": 0.07309938222169876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6808044910430908, + "epoch": 4.84, + "learning_rate": 2.58072696534235e-05, + "loss": 0.607, + "step": 5724, + "task_loss": 1.301774024963379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5031784772872925, + "epoch": 4.84, + "learning_rate": 2.580304311073542e-05, + "loss": 0.5237, + "step": 5725, + "task_loss": 0.9276639819145203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5277193188667297, + "epoch": 4.84, + "learning_rate": 2.579881656804734e-05, + "loss": 0.5116, + "step": 5726, + "task_loss": 0.7294089198112488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48168325424194336, + "epoch": 4.84, + "learning_rate": 2.5794590025359256e-05, + "loss": 0.5225, + "step": 5727, + "task_loss": 0.8994878530502319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3028010129928589, + "epoch": 4.84, + "learning_rate": 2.5790363482671176e-05, + "loss": 0.5108, + "step": 5728, + "task_loss": 0.41921886801719666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5681043863296509, + "epoch": 4.84, + "learning_rate": 2.5786136939983096e-05, + "loss": 0.4543, + "step": 5729, + "task_loss": 0.4027162194252014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5804601907730103, + "epoch": 4.84, + "learning_rate": 2.5781910397295012e-05, + "loss": 0.5915, + "step": 5730, + "task_loss": 0.2755051553249359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45864808559417725, + "epoch": 4.84, + "learning_rate": 2.5777683854606932e-05, + "loss": 0.5924, + "step": 5731, + "task_loss": 0.6574577689170837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5979812145233154, + "epoch": 4.84, + "learning_rate": 2.577345731191885e-05, + "loss": 0.5915, + "step": 5732, + "task_loss": 0.8693372011184692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5351882576942444, + "epoch": 4.85, + "learning_rate": 2.5769230769230768e-05, + "loss": 0.6481, + "step": 5733, + "task_loss": 1.0330792665481567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38662809133529663, + "epoch": 4.85, + "learning_rate": 2.5765004226542688e-05, + "loss": 0.3976, + "step": 5734, + "task_loss": 0.8479583859443665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4904678165912628, + "epoch": 4.85, + "learning_rate": 2.5760777683854607e-05, + "loss": 0.5935, + "step": 5735, + "task_loss": 1.1915929317474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18958629667758942, + "epoch": 4.85, + "learning_rate": 2.575655114116653e-05, + "loss": 0.5473, + "step": 5736, + "task_loss": 0.178871750831604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3452599048614502, + "epoch": 4.85, + "learning_rate": 2.5752324598478444e-05, + "loss": 0.3807, + "step": 5737, + "task_loss": 0.15823009610176086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6254770755767822, + "epoch": 4.85, + "learning_rate": 2.5748098055790367e-05, + "loss": 0.5697, + "step": 5738, + "task_loss": 0.8473315238952637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45445820689201355, + "epoch": 4.85, + "learning_rate": 2.5743871513102286e-05, + "loss": 0.4999, + "step": 5739, + "task_loss": 0.42688536643981934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.457449734210968, + "epoch": 4.85, + "learning_rate": 2.57396449704142e-05, + "loss": 0.5356, + "step": 5740, + "task_loss": 0.4375111758708954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7330834865570068, + "epoch": 4.85, + "learning_rate": 2.5735418427726123e-05, + "loss": 0.5127, + "step": 5741, + "task_loss": 0.7580153942108154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44900187849998474, + "epoch": 4.85, + "learning_rate": 2.5731191885038042e-05, + "loss": 0.4143, + "step": 5742, + "task_loss": 0.7130560278892517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4757002890110016, + "epoch": 4.85, + "learning_rate": 2.5726965342349955e-05, + "loss": 0.5065, + "step": 5743, + "task_loss": 0.08617472648620605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8550693392753601, + "epoch": 4.85, + "learning_rate": 2.572273879966188e-05, + "loss": 0.5284, + "step": 5744, + "task_loss": 0.9987162947654724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36152395606040955, + "epoch": 4.86, + "learning_rate": 2.5718512256973798e-05, + "loss": 0.4245, + "step": 5745, + "task_loss": 0.45784759521484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6456335783004761, + "epoch": 4.86, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.5568, + "step": 5746, + "task_loss": 1.1442067623138428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35833001136779785, + "epoch": 4.86, + "learning_rate": 2.5710059171597634e-05, + "loss": 0.5218, + "step": 5747, + "task_loss": 0.6463460326194763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4828640818595886, + "epoch": 4.86, + "learning_rate": 2.5705832628909554e-05, + "loss": 0.4198, + "step": 5748, + "task_loss": 1.336163878440857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5381168723106384, + "epoch": 4.86, + "learning_rate": 2.5701606086221474e-05, + "loss": 0.483, + "step": 5749, + "task_loss": 1.245650291442871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6171397566795349, + "epoch": 4.86, + "learning_rate": 2.569737954353339e-05, + "loss": 0.5318, + "step": 5750, + "task_loss": 0.8073712587356567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7495328187942505, + "epoch": 4.86, + "learning_rate": 2.569315300084531e-05, + "loss": 0.6927, + "step": 5751, + "task_loss": 0.7202289700508118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3827773332595825, + "epoch": 4.86, + "learning_rate": 2.568892645815723e-05, + "loss": 0.6827, + "step": 5752, + "task_loss": 1.589543104171753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7276687622070312, + "epoch": 4.86, + "learning_rate": 2.5684699915469146e-05, + "loss": 0.5689, + "step": 5753, + "task_loss": 0.42279499769210815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.355838418006897, + "epoch": 4.86, + "learning_rate": 2.5680473372781066e-05, + "loss": 0.4216, + "step": 5754, + "task_loss": 0.6108952760696411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5403056144714355, + "epoch": 4.86, + "learning_rate": 2.567624683009299e-05, + "loss": 0.5262, + "step": 5755, + "task_loss": 1.629965901374817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4730292558670044, + "epoch": 4.87, + "learning_rate": 2.56720202874049e-05, + "loss": 0.4881, + "step": 5756, + "task_loss": 1.3579230308532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4675934910774231, + "epoch": 4.87, + "learning_rate": 2.566779374471682e-05, + "loss": 0.3796, + "step": 5757, + "task_loss": 0.23253118991851807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2809135913848877, + "epoch": 4.87, + "learning_rate": 2.5663567202028745e-05, + "loss": 0.4844, + "step": 5758, + "task_loss": 0.7059835195541382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6576655507087708, + "epoch": 4.87, + "learning_rate": 2.5659340659340658e-05, + "loss": 0.5792, + "step": 5759, + "task_loss": 1.1060556173324585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44497209787368774, + "epoch": 4.87, + "learning_rate": 2.5655114116652577e-05, + "loss": 0.5469, + "step": 5760, + "task_loss": 0.20644617080688477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31369468569755554, + "epoch": 4.87, + "learning_rate": 2.56508875739645e-05, + "loss": 0.4361, + "step": 5761, + "task_loss": 0.09243584424257278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24879266321659088, + "epoch": 4.87, + "learning_rate": 2.5646661031276413e-05, + "loss": 0.3747, + "step": 5762, + "task_loss": 0.1869252771139145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49228155612945557, + "epoch": 4.87, + "learning_rate": 2.5642434488588336e-05, + "loss": 0.4204, + "step": 5763, + "task_loss": 0.16491226851940155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6650729179382324, + "epoch": 4.87, + "learning_rate": 2.5638207945900256e-05, + "loss": 0.544, + "step": 5764, + "task_loss": 0.8105949759483337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6996420621871948, + "epoch": 4.87, + "learning_rate": 2.5633981403212176e-05, + "loss": 0.4555, + "step": 5765, + "task_loss": 0.4945085942745209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8733053207397461, + "epoch": 4.87, + "learning_rate": 2.5629754860524092e-05, + "loss": 0.5221, + "step": 5766, + "task_loss": 1.4503761529922485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5592821836471558, + "epoch": 4.87, + "learning_rate": 2.5625528317836012e-05, + "loss": 0.5416, + "step": 5767, + "task_loss": 0.9621143937110901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5700168013572693, + "epoch": 4.88, + "learning_rate": 2.5621301775147932e-05, + "loss": 0.4748, + "step": 5768, + "task_loss": 1.0551432371139526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4817230701446533, + "epoch": 4.88, + "learning_rate": 2.5617075232459848e-05, + "loss": 0.5339, + "step": 5769, + "task_loss": 0.27261602878570557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.353694885969162, + "epoch": 4.88, + "learning_rate": 2.5612848689771768e-05, + "loss": 0.4851, + "step": 5770, + "task_loss": 0.24526077508926392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2565145492553711, + "epoch": 4.88, + "learning_rate": 2.5608622147083688e-05, + "loss": 0.5508, + "step": 5771, + "task_loss": 0.7748639583587646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4367571771144867, + "epoch": 4.88, + "learning_rate": 2.5604395604395604e-05, + "loss": 0.5058, + "step": 5772, + "task_loss": 1.2265058755874634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6340959072113037, + "epoch": 4.88, + "learning_rate": 2.5600169061707524e-05, + "loss": 0.4916, + "step": 5773, + "task_loss": 0.6340881586074829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37358057498931885, + "epoch": 4.88, + "learning_rate": 2.5595942519019443e-05, + "loss": 0.4644, + "step": 5774, + "task_loss": 0.43144452571868896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23732715845108032, + "epoch": 4.88, + "learning_rate": 2.559171597633136e-05, + "loss": 0.3145, + "step": 5775, + "task_loss": 0.34335991740226746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3814034163951874, + "epoch": 4.88, + "learning_rate": 2.558748943364328e-05, + "loss": 0.4416, + "step": 5776, + "task_loss": 0.9832383990287781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.450017511844635, + "epoch": 4.88, + "learning_rate": 2.55832628909552e-05, + "loss": 0.5042, + "step": 5777, + "task_loss": 0.6779274940490723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7310442924499512, + "epoch": 4.88, + "learning_rate": 2.5579036348267122e-05, + "loss": 0.5786, + "step": 5778, + "task_loss": 1.2296597957611084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30241549015045166, + "epoch": 4.88, + "learning_rate": 2.5574809805579035e-05, + "loss": 0.3852, + "step": 5779, + "task_loss": 0.8809140920639038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6059231758117676, + "epoch": 4.89, + "learning_rate": 2.557058326289096e-05, + "loss": 0.4327, + "step": 5780, + "task_loss": 1.1398409605026245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6389366388320923, + "epoch": 4.89, + "learning_rate": 2.5566356720202878e-05, + "loss": 0.5166, + "step": 5781, + "task_loss": 1.318400502204895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5366359949111938, + "epoch": 4.89, + "learning_rate": 2.556213017751479e-05, + "loss": 0.6158, + "step": 5782, + "task_loss": 0.4537641704082489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4648832082748413, + "epoch": 4.89, + "learning_rate": 2.5557903634826714e-05, + "loss": 0.5396, + "step": 5783, + "task_loss": 0.905745804309845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4635361433029175, + "epoch": 4.89, + "learning_rate": 2.5553677092138634e-05, + "loss": 0.3974, + "step": 5784, + "task_loss": 1.1852481365203857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5991666913032532, + "epoch": 4.89, + "learning_rate": 2.554945054945055e-05, + "loss": 0.4264, + "step": 5785, + "task_loss": 1.1749745607376099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7029132843017578, + "epoch": 4.89, + "learning_rate": 2.554522400676247e-05, + "loss": 0.5258, + "step": 5786, + "task_loss": 0.6575952768325806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.2517204284667969, + "epoch": 4.89, + "learning_rate": 2.554099746407439e-05, + "loss": 0.6655, + "step": 5787, + "task_loss": 0.7345222234725952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39583903551101685, + "epoch": 4.89, + "learning_rate": 2.5536770921386306e-05, + "loss": 0.6035, + "step": 5788, + "task_loss": 0.8615870475769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5294861793518066, + "epoch": 4.89, + "learning_rate": 2.5532544378698226e-05, + "loss": 0.3969, + "step": 5789, + "task_loss": 0.543440043926239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49099695682525635, + "epoch": 4.89, + "learning_rate": 2.5528317836010146e-05, + "loss": 0.5556, + "step": 5790, + "task_loss": 0.574594259262085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35489338636398315, + "epoch": 4.89, + "learning_rate": 2.5524091293322062e-05, + "loss": 0.4158, + "step": 5791, + "task_loss": 0.5920946002006531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36102455854415894, + "epoch": 4.9, + "learning_rate": 2.5519864750633982e-05, + "loss": 0.4099, + "step": 5792, + "task_loss": 0.1813107430934906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3581339120864868, + "epoch": 4.9, + "learning_rate": 2.55156382079459e-05, + "loss": 0.6384, + "step": 5793, + "task_loss": 1.0016592741012573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25984156131744385, + "epoch": 4.9, + "learning_rate": 2.551141166525782e-05, + "loss": 0.388, + "step": 5794, + "task_loss": 0.9038347005844116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5782009959220886, + "epoch": 4.9, + "learning_rate": 2.5507185122569738e-05, + "loss": 0.5071, + "step": 5795, + "task_loss": 0.4652150571346283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4317368268966675, + "epoch": 4.9, + "learning_rate": 2.5502958579881657e-05, + "loss": 0.4976, + "step": 5796, + "task_loss": 0.41916972398757935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9402254819869995, + "epoch": 4.9, + "learning_rate": 2.549873203719358e-05, + "loss": 0.6321, + "step": 5797, + "task_loss": 0.696761965751648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3938978910446167, + "epoch": 4.9, + "learning_rate": 2.5494505494505493e-05, + "loss": 0.5678, + "step": 5798, + "task_loss": 0.7342822551727295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4212161600589752, + "epoch": 4.9, + "learning_rate": 2.5490278951817413e-05, + "loss": 0.5359, + "step": 5799, + "task_loss": 0.6637683510780334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4594886302947998, + "epoch": 4.9, + "learning_rate": 2.5486052409129336e-05, + "loss": 0.5406, + "step": 5800, + "task_loss": 0.6883007884025574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5758783221244812, + "epoch": 4.9, + "learning_rate": 2.548182586644125e-05, + "loss": 0.4502, + "step": 5801, + "task_loss": 1.2055011987686157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.861035943031311, + "epoch": 4.9, + "learning_rate": 2.5477599323753172e-05, + "loss": 0.5318, + "step": 5802, + "task_loss": 0.8397704362869263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4103643596172333, + "epoch": 4.9, + "learning_rate": 2.5473372781065092e-05, + "loss": 0.47, + "step": 5803, + "task_loss": 0.5728910565376282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4958900809288025, + "epoch": 4.91, + "learning_rate": 2.5469146238377005e-05, + "loss": 0.6223, + "step": 5804, + "task_loss": 0.928153395652771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7503769993782043, + "epoch": 4.91, + "learning_rate": 2.5464919695688928e-05, + "loss": 0.5743, + "step": 5805, + "task_loss": 0.4201495945453644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44930627942085266, + "epoch": 4.91, + "learning_rate": 2.5460693153000848e-05, + "loss": 0.4695, + "step": 5806, + "task_loss": 0.40101706981658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5396455526351929, + "epoch": 4.91, + "learning_rate": 2.5456466610312768e-05, + "loss": 0.4015, + "step": 5807, + "task_loss": 0.7356283664703369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47926023602485657, + "epoch": 4.91, + "learning_rate": 2.5452240067624684e-05, + "loss": 0.665, + "step": 5808, + "task_loss": 0.41746148467063904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5205321907997131, + "epoch": 4.91, + "learning_rate": 2.5448013524936604e-05, + "loss": 0.4636, + "step": 5809, + "task_loss": 0.9346333742141724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33122286200523376, + "epoch": 4.91, + "learning_rate": 2.5443786982248524e-05, + "loss": 0.4602, + "step": 5810, + "task_loss": 0.8968300223350525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.361167848110199, + "epoch": 4.91, + "learning_rate": 2.543956043956044e-05, + "loss": 0.5609, + "step": 5811, + "task_loss": 0.6878184080123901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4316611886024475, + "epoch": 4.91, + "learning_rate": 2.543533389687236e-05, + "loss": 0.4646, + "step": 5812, + "task_loss": 0.28144291043281555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3572077751159668, + "epoch": 4.91, + "learning_rate": 2.543110735418428e-05, + "loss": 0.4409, + "step": 5813, + "task_loss": 0.7670496106147766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44395148754119873, + "epoch": 4.91, + "learning_rate": 2.5426880811496196e-05, + "loss": 0.5582, + "step": 5814, + "task_loss": 0.4610663652420044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45230865478515625, + "epoch": 4.91, + "learning_rate": 2.5422654268808115e-05, + "loss": 0.5096, + "step": 5815, + "task_loss": 0.9117465019226074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43482837080955505, + "epoch": 4.92, + "learning_rate": 2.5418427726120035e-05, + "loss": 0.3758, + "step": 5816, + "task_loss": 0.8503986597061157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35342341661453247, + "epoch": 4.92, + "learning_rate": 2.541420118343195e-05, + "loss": 0.3736, + "step": 5817, + "task_loss": 0.7412182092666626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48658478260040283, + "epoch": 4.92, + "learning_rate": 2.540997464074387e-05, + "loss": 0.51, + "step": 5818, + "task_loss": 0.3380177617073059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40707266330718994, + "epoch": 4.92, + "learning_rate": 2.5405748098055794e-05, + "loss": 0.399, + "step": 5819, + "task_loss": 0.25088635087013245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4398203492164612, + "epoch": 4.92, + "learning_rate": 2.5401521555367707e-05, + "loss": 0.5561, + "step": 5820, + "task_loss": 0.499479740858078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5047560930252075, + "epoch": 4.92, + "learning_rate": 2.5397295012679627e-05, + "loss": 0.4849, + "step": 5821, + "task_loss": 1.1149452924728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3739008903503418, + "epoch": 4.92, + "learning_rate": 2.539306846999155e-05, + "loss": 0.48, + "step": 5822, + "task_loss": 0.9758245348930359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7033113241195679, + "epoch": 4.92, + "learning_rate": 2.538884192730347e-05, + "loss": 0.4864, + "step": 5823, + "task_loss": 0.9139550924301147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.586179256439209, + "epoch": 4.92, + "learning_rate": 2.5384615384615383e-05, + "loss": 0.4724, + "step": 5824, + "task_loss": 0.630351185798645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5750783681869507, + "epoch": 4.92, + "learning_rate": 2.5380388841927306e-05, + "loss": 0.4485, + "step": 5825, + "task_loss": 0.2937678396701813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8464914560317993, + "epoch": 4.92, + "learning_rate": 2.5376162299239226e-05, + "loss": 0.5393, + "step": 5826, + "task_loss": 1.059510588645935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4885863661766052, + "epoch": 4.93, + "learning_rate": 2.5371935756551142e-05, + "loss": 0.589, + "step": 5827, + "task_loss": 0.4706907868385315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31396862864494324, + "epoch": 4.93, + "learning_rate": 2.5367709213863062e-05, + "loss": 0.4182, + "step": 5828, + "task_loss": 0.18040288984775543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.496757447719574, + "epoch": 4.93, + "learning_rate": 2.536348267117498e-05, + "loss": 0.5786, + "step": 5829, + "task_loss": 1.1320637464523315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5502353310585022, + "epoch": 4.93, + "learning_rate": 2.5359256128486898e-05, + "loss": 0.4745, + "step": 5830, + "task_loss": 0.17524020373821259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6988099217414856, + "epoch": 4.93, + "learning_rate": 2.5355029585798818e-05, + "loss": 0.5978, + "step": 5831, + "task_loss": 0.525827944278717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48440343141555786, + "epoch": 4.93, + "learning_rate": 2.5350803043110737e-05, + "loss": 0.4035, + "step": 5832, + "task_loss": 0.9719178676605225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49175530672073364, + "epoch": 4.93, + "learning_rate": 2.5346576500422654e-05, + "loss": 0.4928, + "step": 5833, + "task_loss": 0.4114224910736084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3087288737297058, + "epoch": 4.93, + "learning_rate": 2.5342349957734574e-05, + "loss": 0.4708, + "step": 5834, + "task_loss": 0.21083305776119232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5517680644989014, + "epoch": 4.93, + "learning_rate": 2.5338123415046493e-05, + "loss": 0.6133, + "step": 5835, + "task_loss": 0.46359625458717346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.664401650428772, + "epoch": 4.93, + "learning_rate": 2.5333896872358413e-05, + "loss": 0.5442, + "step": 5836, + "task_loss": 1.2301784753799438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19935551285743713, + "epoch": 4.93, + "learning_rate": 2.532967032967033e-05, + "loss": 0.4321, + "step": 5837, + "task_loss": 0.23818977177143097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5411819815635681, + "epoch": 4.93, + "learning_rate": 2.532544378698225e-05, + "loss": 0.5597, + "step": 5838, + "task_loss": 0.728217601776123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37624186277389526, + "epoch": 4.94, + "learning_rate": 2.5321217244294172e-05, + "loss": 0.4684, + "step": 5839, + "task_loss": 0.16302083432674408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.308685839176178, + "epoch": 4.94, + "learning_rate": 2.5316990701606085e-05, + "loss": 0.4068, + "step": 5840, + "task_loss": 0.19114889204502106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37312954664230347, + "epoch": 4.94, + "learning_rate": 2.5312764158918005e-05, + "loss": 0.4076, + "step": 5841, + "task_loss": 0.5269535183906555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.446185827255249, + "epoch": 4.94, + "learning_rate": 2.5308537616229928e-05, + "loss": 0.4988, + "step": 5842, + "task_loss": 0.20380905270576477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4144616723060608, + "epoch": 4.94, + "learning_rate": 2.530431107354184e-05, + "loss": 0.5185, + "step": 5843, + "task_loss": 0.38356897234916687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9075977206230164, + "epoch": 4.94, + "learning_rate": 2.5300084530853764e-05, + "loss": 0.5909, + "step": 5844, + "task_loss": 1.2336329221725464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25992679595947266, + "epoch": 4.94, + "learning_rate": 2.5295857988165684e-05, + "loss": 0.4263, + "step": 5845, + "task_loss": 0.09076137840747833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5832650065422058, + "epoch": 4.94, + "learning_rate": 2.5291631445477597e-05, + "loss": 0.4467, + "step": 5846, + "task_loss": 0.5090808868408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34500059485435486, + "epoch": 4.94, + "learning_rate": 2.528740490278952e-05, + "loss": 0.5137, + "step": 5847, + "task_loss": 0.16382452845573425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.56105637550354, + "epoch": 4.94, + "learning_rate": 2.528317836010144e-05, + "loss": 0.562, + "step": 5848, + "task_loss": 1.9156055450439453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5462300181388855, + "epoch": 4.94, + "learning_rate": 2.5278951817413356e-05, + "loss": 0.4513, + "step": 5849, + "task_loss": 0.2994759678840637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45235100388526917, + "epoch": 4.94, + "learning_rate": 2.5274725274725276e-05, + "loss": 0.5395, + "step": 5850, + "task_loss": 1.000086784362793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35647204518318176, + "epoch": 4.95, + "learning_rate": 2.5270498732037196e-05, + "loss": 0.5548, + "step": 5851, + "task_loss": 0.16849172115325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5011870861053467, + "epoch": 4.95, + "learning_rate": 2.5266272189349115e-05, + "loss": 0.6782, + "step": 5852, + "task_loss": 1.1982852220535278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3965410888195038, + "epoch": 4.95, + "learning_rate": 2.526204564666103e-05, + "loss": 0.4441, + "step": 5853, + "task_loss": 0.2527925670146942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3198370337486267, + "epoch": 4.95, + "learning_rate": 2.525781910397295e-05, + "loss": 0.5133, + "step": 5854, + "task_loss": 0.5387527942657471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3331337869167328, + "epoch": 4.95, + "learning_rate": 2.525359256128487e-05, + "loss": 0.5982, + "step": 5855, + "task_loss": 0.40613484382629395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35700592398643494, + "epoch": 4.95, + "learning_rate": 2.5249366018596787e-05, + "loss": 0.4631, + "step": 5856, + "task_loss": 0.14375852048397064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5817124843597412, + "epoch": 4.95, + "learning_rate": 2.5245139475908707e-05, + "loss": 0.4992, + "step": 5857, + "task_loss": 0.7463344931602478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6648348569869995, + "epoch": 4.95, + "learning_rate": 2.5240912933220627e-05, + "loss": 0.6135, + "step": 5858, + "task_loss": 1.1142513751983643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40841737389564514, + "epoch": 4.95, + "learning_rate": 2.5236686390532543e-05, + "loss": 0.6678, + "step": 5859, + "task_loss": 0.6890674829483032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47071123123168945, + "epoch": 4.95, + "learning_rate": 2.5232459847844463e-05, + "loss": 0.3825, + "step": 5860, + "task_loss": 0.6409825086593628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6167083978652954, + "epoch": 4.95, + "learning_rate": 2.5228233305156386e-05, + "loss": 0.6933, + "step": 5861, + "task_loss": 0.9716223478317261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28745535016059875, + "epoch": 4.95, + "learning_rate": 2.52240067624683e-05, + "loss": 0.4906, + "step": 5862, + "task_loss": 0.04861461743712425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3605726659297943, + "epoch": 4.96, + "learning_rate": 2.521978021978022e-05, + "loss": 0.5034, + "step": 5863, + "task_loss": 0.26824408769607544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3527558445930481, + "epoch": 4.96, + "learning_rate": 2.5215553677092142e-05, + "loss": 0.5126, + "step": 5864, + "task_loss": 0.31379178166389465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35328710079193115, + "epoch": 4.96, + "learning_rate": 2.5211327134404062e-05, + "loss": 0.5614, + "step": 5865, + "task_loss": 1.0871851444244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4643189311027527, + "epoch": 4.96, + "learning_rate": 2.5207100591715978e-05, + "loss": 0.4109, + "step": 5866, + "task_loss": 0.4647694230079651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48734110593795776, + "epoch": 4.96, + "learning_rate": 2.5202874049027898e-05, + "loss": 0.5673, + "step": 5867, + "task_loss": 1.3344870805740356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45599859952926636, + "epoch": 4.96, + "learning_rate": 2.5198647506339818e-05, + "loss": 0.4393, + "step": 5868, + "task_loss": 0.8248142600059509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7107944488525391, + "epoch": 4.96, + "learning_rate": 2.5194420963651734e-05, + "loss": 0.687, + "step": 5869, + "task_loss": 1.3206456899642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5012810230255127, + "epoch": 4.96, + "learning_rate": 2.5190194420963654e-05, + "loss": 0.5362, + "step": 5870, + "task_loss": 1.2237449884414673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30335938930511475, + "epoch": 4.96, + "learning_rate": 2.5185967878275573e-05, + "loss": 0.3399, + "step": 5871, + "task_loss": 0.13658776879310608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6419541835784912, + "epoch": 4.96, + "learning_rate": 2.518174133558749e-05, + "loss": 0.5729, + "step": 5872, + "task_loss": 0.7049633264541626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6737095713615417, + "epoch": 4.96, + "learning_rate": 2.517751479289941e-05, + "loss": 0.3996, + "step": 5873, + "task_loss": 0.5167071223258972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38763627409935, + "epoch": 4.96, + "learning_rate": 2.517328825021133e-05, + "loss": 0.4745, + "step": 5874, + "task_loss": 0.4651373624801636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.506541907787323, + "epoch": 4.97, + "learning_rate": 2.5169061707523246e-05, + "loss": 0.4901, + "step": 5875, + "task_loss": 0.9345484375953674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7523743510246277, + "epoch": 4.97, + "learning_rate": 2.5164835164835165e-05, + "loss": 0.5872, + "step": 5876, + "task_loss": 0.2797583341598511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2746802568435669, + "epoch": 4.97, + "learning_rate": 2.5160608622147085e-05, + "loss": 0.5039, + "step": 5877, + "task_loss": 0.8425626158714294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2547357678413391, + "epoch": 4.97, + "learning_rate": 2.5156382079459e-05, + "loss": 0.5076, + "step": 5878, + "task_loss": 0.5859923362731934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9058804512023926, + "epoch": 4.97, + "learning_rate": 2.515215553677092e-05, + "loss": 0.7911, + "step": 5879, + "task_loss": 1.2105085849761963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3001091778278351, + "epoch": 4.97, + "learning_rate": 2.514792899408284e-05, + "loss": 0.3576, + "step": 5880, + "task_loss": 0.33678141236305237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5725070238113403, + "epoch": 4.97, + "learning_rate": 2.5143702451394764e-05, + "loss": 0.6149, + "step": 5881, + "task_loss": 1.29972505569458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4795590937137604, + "epoch": 4.97, + "learning_rate": 2.5139475908706677e-05, + "loss": 0.5009, + "step": 5882, + "task_loss": 0.8522188663482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4290587306022644, + "epoch": 4.97, + "learning_rate": 2.51352493660186e-05, + "loss": 0.4123, + "step": 5883, + "task_loss": 0.7262642979621887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4498727321624756, + "epoch": 4.97, + "learning_rate": 2.513102282333052e-05, + "loss": 0.5662, + "step": 5884, + "task_loss": 0.328357070684433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4463154673576355, + "epoch": 4.97, + "learning_rate": 2.5126796280642433e-05, + "loss": 0.4233, + "step": 5885, + "task_loss": 0.628167986869812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4322727918624878, + "epoch": 4.97, + "learning_rate": 2.5122569737954356e-05, + "loss": 0.3901, + "step": 5886, + "task_loss": 0.5687154531478882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.387350469827652, + "epoch": 4.98, + "learning_rate": 2.5118343195266276e-05, + "loss": 0.4297, + "step": 5887, + "task_loss": 0.3951967656612396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9616945385932922, + "epoch": 4.98, + "learning_rate": 2.511411665257819e-05, + "loss": 0.5759, + "step": 5888, + "task_loss": 0.7422794699668884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2975861728191376, + "epoch": 4.98, + "learning_rate": 2.5109890109890112e-05, + "loss": 0.4499, + "step": 5889, + "task_loss": 0.2681705951690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6026228666305542, + "epoch": 4.98, + "learning_rate": 2.510566356720203e-05, + "loss": 0.4986, + "step": 5890, + "task_loss": 0.708349347114563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3391422927379608, + "epoch": 4.98, + "learning_rate": 2.5101437024513948e-05, + "loss": 0.4834, + "step": 5891, + "task_loss": 0.35420122742652893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5993404388427734, + "epoch": 4.98, + "learning_rate": 2.5097210481825868e-05, + "loss": 0.6312, + "step": 5892, + "task_loss": 0.2654555141925812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5072178840637207, + "epoch": 4.98, + "learning_rate": 2.5092983939137787e-05, + "loss": 0.4789, + "step": 5893, + "task_loss": 0.23766012489795685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8217976689338684, + "epoch": 4.98, + "learning_rate": 2.5088757396449707e-05, + "loss": 0.5146, + "step": 5894, + "task_loss": 1.3266856670379639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3787155747413635, + "epoch": 4.98, + "learning_rate": 2.5084530853761623e-05, + "loss": 0.5612, + "step": 5895, + "task_loss": 0.7959149479866028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7610986232757568, + "epoch": 4.98, + "learning_rate": 2.5080304311073543e-05, + "loss": 0.5394, + "step": 5896, + "task_loss": 0.5654633641242981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4275207221508026, + "epoch": 4.98, + "learning_rate": 2.5076077768385463e-05, + "loss": 0.4836, + "step": 5897, + "task_loss": 0.4208388030529022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3316773772239685, + "epoch": 4.99, + "learning_rate": 2.507185122569738e-05, + "loss": 0.3813, + "step": 5898, + "task_loss": 0.47392022609710693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5100721716880798, + "epoch": 4.99, + "learning_rate": 2.50676246830093e-05, + "loss": 0.4499, + "step": 5899, + "task_loss": 0.7384151220321655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7506421804428101, + "epoch": 4.99, + "learning_rate": 2.506339814032122e-05, + "loss": 0.5957, + "step": 5900, + "task_loss": 0.9187194108963013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42261719703674316, + "epoch": 4.99, + "learning_rate": 2.5059171597633135e-05, + "loss": 0.4444, + "step": 5901, + "task_loss": 0.4331877827644348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8414658904075623, + "epoch": 4.99, + "learning_rate": 2.5054945054945055e-05, + "loss": 0.6473, + "step": 5902, + "task_loss": 1.207355260848999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6883753538131714, + "epoch": 4.99, + "learning_rate": 2.5050718512256978e-05, + "loss": 0.5487, + "step": 5903, + "task_loss": 0.25709763169288635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48178380727767944, + "epoch": 4.99, + "learning_rate": 2.504649196956889e-05, + "loss": 0.4289, + "step": 5904, + "task_loss": 0.5272418856620789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5265001058578491, + "epoch": 4.99, + "learning_rate": 2.504226542688081e-05, + "loss": 0.4962, + "step": 5905, + "task_loss": 0.6459428668022156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34180378913879395, + "epoch": 4.99, + "learning_rate": 2.5038038884192734e-05, + "loss": 0.4022, + "step": 5906, + "task_loss": 0.6246843338012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5612274408340454, + "epoch": 4.99, + "learning_rate": 2.5033812341504647e-05, + "loss": 0.6222, + "step": 5907, + "task_loss": 0.9123996496200562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5337389707565308, + "epoch": 4.99, + "learning_rate": 2.502958579881657e-05, + "loss": 0.6182, + "step": 5908, + "task_loss": 0.5444350242614746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4110601246356964, + "epoch": 4.99, + "learning_rate": 2.502535925612849e-05, + "loss": 0.554, + "step": 5909, + "task_loss": 1.1575924158096313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7683933973312378, + "epoch": 5.0, + "learning_rate": 2.502113271344041e-05, + "loss": 0.6302, + "step": 5910, + "task_loss": 0.9224808216094971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31058940291404724, + "epoch": 5.0, + "learning_rate": 2.5016906170752326e-05, + "loss": 0.417, + "step": 5911, + "task_loss": 1.0095516443252563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8734809756278992, + "epoch": 5.0, + "learning_rate": 2.5012679628064245e-05, + "loss": 0.5721, + "step": 5912, + "task_loss": 0.343243271112442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5232601165771484, + "epoch": 5.0, + "learning_rate": 2.5008453085376165e-05, + "loss": 0.4468, + "step": 5913, + "task_loss": 0.28599193692207336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5779330134391785, + "epoch": 5.0, + "learning_rate": 2.500422654268808e-05, + "loss": 0.4849, + "step": 5914, + "task_loss": 0.6249581575393677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36032405495643616, + "epoch": 5.0, + "learning_rate": 2.5e-05, + "loss": 0.4809, + "step": 5915, + "task_loss": 0.5216179490089417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6082985401153564, + "epoch": 5.0, + "learning_rate": 2.4995773457311918e-05, + "loss": 0.9102, + "step": 5916, + "task_loss": 0.6683598160743713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8055363893508911, + "epoch": 5.0, + "learning_rate": 2.499154691462384e-05, + "loss": 0.492, + "step": 5917, + "task_loss": 0.9046040177345276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5577858686447144, + "epoch": 5.0, + "learning_rate": 2.4987320371935757e-05, + "loss": 0.5702, + "step": 5918, + "task_loss": 1.107552170753479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33932915329933167, + "epoch": 5.0, + "learning_rate": 2.4983093829247677e-05, + "loss": 0.4536, + "step": 5919, + "task_loss": 0.34433531761169434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.719271183013916, + "epoch": 5.0, + "learning_rate": 2.4978867286559597e-05, + "loss": 0.6003, + "step": 5920, + "task_loss": 0.6446596384048462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7384810447692871, + "epoch": 5.01, + "learning_rate": 2.4974640743871513e-05, + "loss": 0.5199, + "step": 5921, + "task_loss": 0.3789003789424896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3647576868534088, + "epoch": 5.01, + "learning_rate": 2.4970414201183433e-05, + "loss": 0.5021, + "step": 5922, + "task_loss": 0.6205793619155884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6078684329986572, + "epoch": 5.01, + "learning_rate": 2.4966187658495352e-05, + "loss": 0.6143, + "step": 5923, + "task_loss": 0.6133117079734802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3312162458896637, + "epoch": 5.01, + "learning_rate": 2.496196111580727e-05, + "loss": 0.3951, + "step": 5924, + "task_loss": 0.554440975189209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5120965242385864, + "epoch": 5.01, + "learning_rate": 2.4957734573119192e-05, + "loss": 0.4515, + "step": 5925, + "task_loss": 0.6741136312484741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25445348024368286, + "epoch": 5.01, + "learning_rate": 2.4953508030431108e-05, + "loss": 0.4508, + "step": 5926, + "task_loss": 0.8903825283050537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3473491370677948, + "epoch": 5.01, + "learning_rate": 2.4949281487743028e-05, + "loss": 0.3818, + "step": 5927, + "task_loss": 1.3895976543426514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33885765075683594, + "epoch": 5.01, + "learning_rate": 2.4945054945054948e-05, + "loss": 0.4322, + "step": 5928, + "task_loss": 0.08260716497898102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5117425918579102, + "epoch": 5.01, + "learning_rate": 2.4940828402366864e-05, + "loss": 0.3731, + "step": 5929, + "task_loss": 0.23022602498531342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3977622091770172, + "epoch": 5.01, + "learning_rate": 2.4936601859678784e-05, + "loss": 0.479, + "step": 5930, + "task_loss": 0.2684060335159302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.608170211315155, + "epoch": 5.01, + "learning_rate": 2.4932375316990703e-05, + "loss": 0.5224, + "step": 5931, + "task_loss": 1.3077064752578735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2939225435256958, + "epoch": 5.01, + "learning_rate": 2.492814877430262e-05, + "loss": 0.3852, + "step": 5932, + "task_loss": 0.43751248717308044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42494404315948486, + "epoch": 5.02, + "learning_rate": 2.492392223161454e-05, + "loss": 0.4691, + "step": 5933, + "task_loss": 1.1505721807479858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19124269485473633, + "epoch": 5.02, + "learning_rate": 2.491969568892646e-05, + "loss": 0.4465, + "step": 5934, + "task_loss": 0.4035099744796753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6567538976669312, + "epoch": 5.02, + "learning_rate": 2.491546914623838e-05, + "loss": 0.431, + "step": 5935, + "task_loss": 0.4618139863014221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3672664165496826, + "epoch": 5.02, + "learning_rate": 2.49112426035503e-05, + "loss": 0.4516, + "step": 5936, + "task_loss": 0.14847822487354279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3138543367385864, + "epoch": 5.02, + "learning_rate": 2.4907016060862215e-05, + "loss": 0.3702, + "step": 5937, + "task_loss": 0.53508061170578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42156708240509033, + "epoch": 5.02, + "learning_rate": 2.4902789518174135e-05, + "loss": 0.456, + "step": 5938, + "task_loss": 0.22390834987163544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6893063187599182, + "epoch": 5.02, + "learning_rate": 2.4898562975486055e-05, + "loss": 0.546, + "step": 5939, + "task_loss": 1.4197015762329102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7896542549133301, + "epoch": 5.02, + "learning_rate": 2.489433643279797e-05, + "loss": 0.5581, + "step": 5940, + "task_loss": 0.5447805523872375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5559636354446411, + "epoch": 5.02, + "learning_rate": 2.489010989010989e-05, + "loss": 0.5189, + "step": 5941, + "task_loss": 1.4816220998764038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7295690774917603, + "epoch": 5.02, + "learning_rate": 2.488588334742181e-05, + "loss": 0.5012, + "step": 5942, + "task_loss": 0.8214719295501709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3193829655647278, + "epoch": 5.02, + "learning_rate": 2.488165680473373e-05, + "loss": 0.5085, + "step": 5943, + "task_loss": 0.42772430181503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41968590021133423, + "epoch": 5.02, + "learning_rate": 2.4877430262045647e-05, + "loss": 0.5842, + "step": 5944, + "task_loss": 0.9024659395217896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5182512998580933, + "epoch": 5.03, + "learning_rate": 2.4873203719357566e-05, + "loss": 0.5233, + "step": 5945, + "task_loss": 0.6191232800483704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7017192244529724, + "epoch": 5.03, + "learning_rate": 2.4868977176669486e-05, + "loss": 0.4193, + "step": 5946, + "task_loss": 0.28791308403015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37173032760620117, + "epoch": 5.03, + "learning_rate": 2.4864750633981402e-05, + "loss": 0.587, + "step": 5947, + "task_loss": 0.6532882452011108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24450619518756866, + "epoch": 5.03, + "learning_rate": 2.4860524091293325e-05, + "loss": 0.429, + "step": 5948, + "task_loss": 1.1670103073120117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2502221465110779, + "epoch": 5.03, + "learning_rate": 2.4856297548605242e-05, + "loss": 0.4137, + "step": 5949, + "task_loss": 0.3373726010322571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42184120416641235, + "epoch": 5.03, + "learning_rate": 2.485207100591716e-05, + "loss": 0.4319, + "step": 5950, + "task_loss": 0.5769553184509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3875751495361328, + "epoch": 5.03, + "learning_rate": 2.484784446322908e-05, + "loss": 0.5062, + "step": 5951, + "task_loss": 0.9066849946975708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7745203971862793, + "epoch": 5.03, + "learning_rate": 2.4843617920540998e-05, + "loss": 0.5165, + "step": 5952, + "task_loss": 0.45830780267715454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.358915239572525, + "epoch": 5.03, + "learning_rate": 2.4839391377852917e-05, + "loss": 0.4424, + "step": 5953, + "task_loss": 0.9698063731193542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42141085863113403, + "epoch": 5.03, + "learning_rate": 2.4835164835164837e-05, + "loss": 0.4029, + "step": 5954, + "task_loss": 0.37209227681159973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8922230005264282, + "epoch": 5.03, + "learning_rate": 2.4830938292476753e-05, + "loss": 0.6084, + "step": 5955, + "task_loss": 0.8888617753982544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4024159014225006, + "epoch": 5.03, + "learning_rate": 2.4826711749788677e-05, + "loss": 0.3417, + "step": 5956, + "task_loss": 0.3548760712146759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25430941581726074, + "epoch": 5.04, + "learning_rate": 2.4822485207100593e-05, + "loss": 0.3777, + "step": 5957, + "task_loss": 0.1606295108795166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4838330149650574, + "epoch": 5.04, + "learning_rate": 2.481825866441251e-05, + "loss": 0.4848, + "step": 5958, + "task_loss": 0.4661611020565033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43831878900527954, + "epoch": 5.04, + "learning_rate": 2.4814032121724432e-05, + "loss": 0.4736, + "step": 5959, + "task_loss": 1.5398967266082764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3489435315132141, + "epoch": 5.04, + "learning_rate": 2.480980557903635e-05, + "loss": 0.4425, + "step": 5960, + "task_loss": 0.31673723459243774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31559956073760986, + "epoch": 5.04, + "learning_rate": 2.480557903634827e-05, + "loss": 0.5137, + "step": 5961, + "task_loss": 0.19639374315738678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6046870946884155, + "epoch": 5.04, + "learning_rate": 2.4801352493660188e-05, + "loss": 0.5283, + "step": 5962, + "task_loss": 0.6970154047012329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7100826501846313, + "epoch": 5.04, + "learning_rate": 2.4797125950972105e-05, + "loss": 0.5779, + "step": 5963, + "task_loss": 1.6279972791671753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5161731839179993, + "epoch": 5.04, + "learning_rate": 2.4792899408284024e-05, + "loss": 0.5408, + "step": 5964, + "task_loss": 0.6651297211647034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5025107860565186, + "epoch": 5.04, + "learning_rate": 2.4788672865595944e-05, + "loss": 0.4349, + "step": 5965, + "task_loss": 1.3200857639312744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.720709502696991, + "epoch": 5.04, + "learning_rate": 2.478444632290786e-05, + "loss": 0.704, + "step": 5966, + "task_loss": 1.0721192359924316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4754113554954529, + "epoch": 5.04, + "learning_rate": 2.4780219780219784e-05, + "loss": 0.457, + "step": 5967, + "task_loss": 0.7590563297271729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5752441883087158, + "epoch": 5.04, + "learning_rate": 2.47759932375317e-05, + "loss": 0.5556, + "step": 5968, + "task_loss": 0.9572807550430298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6993789672851562, + "epoch": 5.05, + "learning_rate": 2.4771766694843616e-05, + "loss": 0.5985, + "step": 5969, + "task_loss": 1.6071454286575317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2880752682685852, + "epoch": 5.05, + "learning_rate": 2.476754015215554e-05, + "loss": 0.5232, + "step": 5970, + "task_loss": 0.16263897716999054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6551597714424133, + "epoch": 5.05, + "learning_rate": 2.4763313609467456e-05, + "loss": 0.4821, + "step": 5971, + "task_loss": 1.11124587059021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44097816944122314, + "epoch": 5.05, + "learning_rate": 2.4759087066779375e-05, + "loss": 0.5146, + "step": 5972, + "task_loss": 0.882554292678833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35184478759765625, + "epoch": 5.05, + "learning_rate": 2.4754860524091295e-05, + "loss": 0.3635, + "step": 5973, + "task_loss": 0.6743343472480774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5512925386428833, + "epoch": 5.05, + "learning_rate": 2.475063398140321e-05, + "loss": 0.5301, + "step": 5974, + "task_loss": 0.5781324505805969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6125876903533936, + "epoch": 5.05, + "learning_rate": 2.474640743871513e-05, + "loss": 0.4122, + "step": 5975, + "task_loss": 0.685910701751709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5098785758018494, + "epoch": 5.05, + "learning_rate": 2.474218089602705e-05, + "loss": 0.4769, + "step": 5976, + "task_loss": 1.2178208827972412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45047181844711304, + "epoch": 5.05, + "learning_rate": 2.473795435333897e-05, + "loss": 0.5641, + "step": 5977, + "task_loss": 0.36383482813835144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40200772881507874, + "epoch": 5.05, + "learning_rate": 2.473372781065089e-05, + "loss": 0.4326, + "step": 5978, + "task_loss": 0.18079520761966705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33880242705345154, + "epoch": 5.05, + "learning_rate": 2.4729501267962807e-05, + "loss": 0.3363, + "step": 5979, + "task_loss": 0.3427959084510803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33176741003990173, + "epoch": 5.05, + "learning_rate": 2.4725274725274727e-05, + "loss": 0.538, + "step": 5980, + "task_loss": 0.29471755027770996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.460461288690567, + "epoch": 5.06, + "learning_rate": 2.4721048182586646e-05, + "loss": 0.4772, + "step": 5981, + "task_loss": 1.1523021459579468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48022761940956116, + "epoch": 5.06, + "learning_rate": 2.4716821639898563e-05, + "loss": 0.5004, + "step": 5982, + "task_loss": 0.948471188545227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37389588356018066, + "epoch": 5.06, + "learning_rate": 2.4712595097210482e-05, + "loss": 0.4084, + "step": 5983, + "task_loss": 0.3665182590484619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.589600682258606, + "epoch": 5.06, + "learning_rate": 2.4708368554522402e-05, + "loss": 0.6166, + "step": 5984, + "task_loss": 0.78522789478302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6796927452087402, + "epoch": 5.06, + "learning_rate": 2.4704142011834322e-05, + "loss": 0.5379, + "step": 5985, + "task_loss": 1.232219934463501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4793018698692322, + "epoch": 5.06, + "learning_rate": 2.4699915469146238e-05, + "loss": 0.4681, + "step": 5986, + "task_loss": 0.5304792523384094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.546089768409729, + "epoch": 5.06, + "learning_rate": 2.4695688926458158e-05, + "loss": 0.6239, + "step": 5987, + "task_loss": 2.0606584548950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5196162462234497, + "epoch": 5.06, + "learning_rate": 2.4691462383770078e-05, + "loss": 0.4726, + "step": 5988, + "task_loss": 0.33798322081565857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.391682505607605, + "epoch": 5.06, + "learning_rate": 2.4687235841081997e-05, + "loss": 0.4453, + "step": 5989, + "task_loss": 1.0684736967086792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5201181769371033, + "epoch": 5.06, + "learning_rate": 2.4683009298393914e-05, + "loss": 0.5392, + "step": 5990, + "task_loss": 0.6769152879714966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4350537955760956, + "epoch": 5.06, + "learning_rate": 2.4678782755705834e-05, + "loss": 0.5484, + "step": 5991, + "task_loss": 1.0718979835510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.280165433883667, + "epoch": 5.07, + "learning_rate": 2.4674556213017753e-05, + "loss": 0.3574, + "step": 5992, + "task_loss": 0.17201143503189087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5555096864700317, + "epoch": 5.07, + "learning_rate": 2.4670329670329673e-05, + "loss": 0.5498, + "step": 5993, + "task_loss": 1.1042184829711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.951995849609375, + "epoch": 5.07, + "learning_rate": 2.466610312764159e-05, + "loss": 0.5201, + "step": 5994, + "task_loss": 1.2007079124450684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.345053493976593, + "epoch": 5.07, + "learning_rate": 2.466187658495351e-05, + "loss": 0.5336, + "step": 5995, + "task_loss": 0.9534551501274109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34661632776260376, + "epoch": 5.07, + "learning_rate": 2.465765004226543e-05, + "loss": 0.4346, + "step": 5996, + "task_loss": 0.8331199884414673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28205406665802, + "epoch": 5.07, + "learning_rate": 2.4653423499577345e-05, + "loss": 0.4945, + "step": 5997, + "task_loss": 0.09619945287704468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3882964253425598, + "epoch": 5.07, + "learning_rate": 2.4649196956889265e-05, + "loss": 0.4401, + "step": 5998, + "task_loss": 0.10834480822086334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39687180519104004, + "epoch": 5.07, + "learning_rate": 2.4644970414201185e-05, + "loss": 0.3267, + "step": 5999, + "task_loss": 0.514348030090332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3934389352798462, + "epoch": 5.07, + "learning_rate": 2.4640743871513104e-05, + "loss": 0.5421, + "step": 6000, + "task_loss": 0.18161803483963013 + }, + { + "epoch": 5.07, + "eval_accuracy": 0.9048712871287129, + "eval_loss": 0.3182193338871002, + "eval_runtime": 230.0646, + "eval_samples_per_second": 109.752, + "eval_steps_per_second": 0.861, + "step": 6000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5449993014335632, + "epoch": 5.07, + "learning_rate": 2.4636517328825024e-05, + "loss": 0.4568, + "step": 6001, + "task_loss": 1.0059036016464233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33474215865135193, + "epoch": 5.07, + "learning_rate": 2.463229078613694e-05, + "loss": 0.5353, + "step": 6002, + "task_loss": 1.0115423202514648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4833511710166931, + "epoch": 5.07, + "learning_rate": 2.462806424344886e-05, + "loss": 0.4584, + "step": 6003, + "task_loss": 0.12505419552326202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5871100425720215, + "epoch": 5.08, + "learning_rate": 2.462383770076078e-05, + "loss": 0.4213, + "step": 6004, + "task_loss": 1.026135802268982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49546924233436584, + "epoch": 5.08, + "learning_rate": 2.4619611158072696e-05, + "loss": 0.4073, + "step": 6005, + "task_loss": 0.8182092905044556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38001054525375366, + "epoch": 5.08, + "learning_rate": 2.461538461538462e-05, + "loss": 0.3664, + "step": 6006, + "task_loss": 0.4444688558578491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46788477897644043, + "epoch": 5.08, + "learning_rate": 2.4611158072696536e-05, + "loss": 0.3951, + "step": 6007, + "task_loss": 0.8514207601547241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3878398537635803, + "epoch": 5.08, + "learning_rate": 2.4606931530008452e-05, + "loss": 0.3701, + "step": 6008, + "task_loss": 0.42015862464904785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43556442856788635, + "epoch": 5.08, + "learning_rate": 2.4602704987320375e-05, + "loss": 0.4738, + "step": 6009, + "task_loss": 0.9199793934822083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32857829332351685, + "epoch": 5.08, + "learning_rate": 2.459847844463229e-05, + "loss": 0.565, + "step": 6010, + "task_loss": 0.17003366351127625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5306485891342163, + "epoch": 5.08, + "learning_rate": 2.4594251901944208e-05, + "loss": 0.5209, + "step": 6011, + "task_loss": 0.6331378221511841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4917563796043396, + "epoch": 5.08, + "learning_rate": 2.459002535925613e-05, + "loss": 0.5475, + "step": 6012, + "task_loss": 0.40922629833221436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7640315294265747, + "epoch": 5.08, + "learning_rate": 2.4585798816568047e-05, + "loss": 0.5034, + "step": 6013, + "task_loss": 1.272016167640686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3246115446090698, + "epoch": 5.08, + "learning_rate": 2.4581572273879967e-05, + "loss": 0.4078, + "step": 6014, + "task_loss": 0.1390765905380249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4721505641937256, + "epoch": 5.08, + "learning_rate": 2.4577345731191887e-05, + "loss": 0.3577, + "step": 6015, + "task_loss": 0.20935770869255066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6544747948646545, + "epoch": 5.09, + "learning_rate": 2.4573119188503803e-05, + "loss": 0.424, + "step": 6016, + "task_loss": 0.7490522861480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48439788818359375, + "epoch": 5.09, + "learning_rate": 2.4568892645815726e-05, + "loss": 0.5286, + "step": 6017, + "task_loss": 0.4856950342655182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3707444667816162, + "epoch": 5.09, + "learning_rate": 2.4564666103127643e-05, + "loss": 0.438, + "step": 6018, + "task_loss": 0.7697510123252869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31932902336120605, + "epoch": 5.09, + "learning_rate": 2.456043956043956e-05, + "loss": 0.5142, + "step": 6019, + "task_loss": 0.10141529142856598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5338470339775085, + "epoch": 5.09, + "learning_rate": 2.4556213017751482e-05, + "loss": 0.6036, + "step": 6020, + "task_loss": 0.7379311323165894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8293426036834717, + "epoch": 5.09, + "learning_rate": 2.45519864750634e-05, + "loss": 0.5958, + "step": 6021, + "task_loss": 1.1147619485855103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35722851753234863, + "epoch": 5.09, + "learning_rate": 2.454775993237532e-05, + "loss": 0.4757, + "step": 6022, + "task_loss": 0.789341390132904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6629540920257568, + "epoch": 5.09, + "learning_rate": 2.4543533389687238e-05, + "loss": 0.6033, + "step": 6023, + "task_loss": 0.4335033595561981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4034222364425659, + "epoch": 5.09, + "learning_rate": 2.4539306846999154e-05, + "loss": 0.4392, + "step": 6024, + "task_loss": 0.09940929710865021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6093487739562988, + "epoch": 5.09, + "learning_rate": 2.4535080304311074e-05, + "loss": 0.5179, + "step": 6025, + "task_loss": 0.6633531451225281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4297054409980774, + "epoch": 5.09, + "learning_rate": 2.4530853761622994e-05, + "loss": 0.3693, + "step": 6026, + "task_loss": 0.3079739809036255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46619713306427, + "epoch": 5.09, + "learning_rate": 2.452662721893491e-05, + "loss": 0.543, + "step": 6027, + "task_loss": 0.1680336594581604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5810904502868652, + "epoch": 5.1, + "learning_rate": 2.452240067624683e-05, + "loss": 0.5541, + "step": 6028, + "task_loss": 1.3454046249389648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30286654829978943, + "epoch": 5.1, + "learning_rate": 2.451817413355875e-05, + "loss": 0.6656, + "step": 6029, + "task_loss": 0.5029436945915222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7220510244369507, + "epoch": 5.1, + "learning_rate": 2.451394759087067e-05, + "loss": 0.4913, + "step": 6030, + "task_loss": 0.6276025772094727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37081706523895264, + "epoch": 5.1, + "learning_rate": 2.450972104818259e-05, + "loss": 0.3449, + "step": 6031, + "task_loss": 0.11580011248588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4230744540691376, + "epoch": 5.1, + "learning_rate": 2.4505494505494506e-05, + "loss": 0.4936, + "step": 6032, + "task_loss": 0.4627753496170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46361660957336426, + "epoch": 5.1, + "learning_rate": 2.4501267962806425e-05, + "loss": 0.4934, + "step": 6033, + "task_loss": 1.1228450536727905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22709789872169495, + "epoch": 5.1, + "learning_rate": 2.4497041420118345e-05, + "loss": 0.3665, + "step": 6034, + "task_loss": 0.06817236542701721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3843761086463928, + "epoch": 5.1, + "learning_rate": 2.4492814877430265e-05, + "loss": 0.4308, + "step": 6035, + "task_loss": 0.8635696172714233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4158199429512024, + "epoch": 5.1, + "learning_rate": 2.448858833474218e-05, + "loss": 0.6124, + "step": 6036, + "task_loss": 0.39460551738739014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5129265785217285, + "epoch": 5.1, + "learning_rate": 2.44843617920541e-05, + "loss": 0.4993, + "step": 6037, + "task_loss": 0.7277722358703613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4055904746055603, + "epoch": 5.1, + "learning_rate": 2.448013524936602e-05, + "loss": 0.5222, + "step": 6038, + "task_loss": 0.611846387386322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21709352731704712, + "epoch": 5.1, + "learning_rate": 2.4475908706677937e-05, + "loss": 0.387, + "step": 6039, + "task_loss": 0.505352258682251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3227002024650574, + "epoch": 5.11, + "learning_rate": 2.4471682163989857e-05, + "loss": 0.3806, + "step": 6040, + "task_loss": 0.4577132761478424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37678706645965576, + "epoch": 5.11, + "learning_rate": 2.4467455621301776e-05, + "loss": 0.6274, + "step": 6041, + "task_loss": 0.09494077414274216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5429909229278564, + "epoch": 5.11, + "learning_rate": 2.4463229078613696e-05, + "loss": 0.4627, + "step": 6042, + "task_loss": 0.44852563738822937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35085201263427734, + "epoch": 5.11, + "learning_rate": 2.4459002535925616e-05, + "loss": 0.3641, + "step": 6043, + "task_loss": 0.9549075365066528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41243740916252136, + "epoch": 5.11, + "learning_rate": 2.4454775993237532e-05, + "loss": 0.4943, + "step": 6044, + "task_loss": 0.877136766910553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4605540633201599, + "epoch": 5.11, + "learning_rate": 2.4450549450549452e-05, + "loss": 0.5337, + "step": 6045, + "task_loss": 0.8325431942939758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31457799673080444, + "epoch": 5.11, + "learning_rate": 2.4446322907861372e-05, + "loss": 0.5164, + "step": 6046, + "task_loss": 1.3069870471954346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45893871784210205, + "epoch": 5.11, + "learning_rate": 2.4442096365173288e-05, + "loss": 0.5375, + "step": 6047, + "task_loss": 0.40529030561447144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31940460205078125, + "epoch": 5.11, + "learning_rate": 2.4437869822485208e-05, + "loss": 0.5391, + "step": 6048, + "task_loss": 0.14863824844360352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49545902013778687, + "epoch": 5.11, + "learning_rate": 2.4433643279797128e-05, + "loss": 0.4807, + "step": 6049, + "task_loss": 0.9401209354400635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3187169134616852, + "epoch": 5.11, + "learning_rate": 2.4429416737109044e-05, + "loss": 0.4381, + "step": 6050, + "task_loss": 0.8888756632804871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42782485485076904, + "epoch": 5.11, + "learning_rate": 2.4425190194420967e-05, + "loss": 0.4939, + "step": 6051, + "task_loss": 0.9968992471694946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5566502213478088, + "epoch": 5.12, + "learning_rate": 2.4420963651732883e-05, + "loss": 0.4957, + "step": 6052, + "task_loss": 0.7148687839508057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47554680705070496, + "epoch": 5.12, + "learning_rate": 2.4416737109044803e-05, + "loss": 0.5468, + "step": 6053, + "task_loss": 0.8989934325218201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7137382626533508, + "epoch": 5.12, + "learning_rate": 2.4412510566356723e-05, + "loss": 0.5279, + "step": 6054, + "task_loss": 1.0508335828781128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48718878626823425, + "epoch": 5.12, + "learning_rate": 2.440828402366864e-05, + "loss": 0.5429, + "step": 6055, + "task_loss": 1.394580602645874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3145267367362976, + "epoch": 5.12, + "learning_rate": 2.440405748098056e-05, + "loss": 0.4607, + "step": 6056, + "task_loss": 0.2770923674106598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6932052969932556, + "epoch": 5.12, + "learning_rate": 2.439983093829248e-05, + "loss": 0.4571, + "step": 6057, + "task_loss": 0.6293013095855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4722960889339447, + "epoch": 5.12, + "learning_rate": 2.4395604395604395e-05, + "loss": 0.4686, + "step": 6058, + "task_loss": 0.44843149185180664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49339139461517334, + "epoch": 5.12, + "learning_rate": 2.4391377852916318e-05, + "loss": 0.4539, + "step": 6059, + "task_loss": 0.6607227325439453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3622037172317505, + "epoch": 5.12, + "learning_rate": 2.4387151310228235e-05, + "loss": 0.4488, + "step": 6060, + "task_loss": 0.2901977598667145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5500739812850952, + "epoch": 5.12, + "learning_rate": 2.438292476754015e-05, + "loss": 0.7224, + "step": 6061, + "task_loss": 2.1479337215423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2459859549999237, + "epoch": 5.12, + "learning_rate": 2.4378698224852074e-05, + "loss": 0.3924, + "step": 6062, + "task_loss": 0.14313580095767975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3563840091228485, + "epoch": 5.13, + "learning_rate": 2.437447168216399e-05, + "loss": 0.4015, + "step": 6063, + "task_loss": 0.1891101449728012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6395384669303894, + "epoch": 5.13, + "learning_rate": 2.437024513947591e-05, + "loss": 0.4475, + "step": 6064, + "task_loss": 0.6162099838256836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5693876147270203, + "epoch": 5.13, + "learning_rate": 2.436601859678783e-05, + "loss": 0.4507, + "step": 6065, + "task_loss": 0.4233049154281616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4631159007549286, + "epoch": 5.13, + "learning_rate": 2.4361792054099746e-05, + "loss": 0.4983, + "step": 6066, + "task_loss": 0.49086061120033264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46824151277542114, + "epoch": 5.13, + "learning_rate": 2.4357565511411666e-05, + "loss": 0.6295, + "step": 6067, + "task_loss": 1.0286892652511597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33869636058807373, + "epoch": 5.13, + "learning_rate": 2.4353338968723586e-05, + "loss": 0.588, + "step": 6068, + "task_loss": 0.5210633873939514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3325342535972595, + "epoch": 5.13, + "learning_rate": 2.4349112426035502e-05, + "loss": 0.5058, + "step": 6069, + "task_loss": 0.29091042280197144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4005863070487976, + "epoch": 5.13, + "learning_rate": 2.4344885883347425e-05, + "loss": 0.5489, + "step": 6070, + "task_loss": 0.5245393514633179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6486250162124634, + "epoch": 5.13, + "learning_rate": 2.434065934065934e-05, + "loss": 0.5983, + "step": 6071, + "task_loss": 1.8012388944625854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5699920058250427, + "epoch": 5.13, + "learning_rate": 2.433643279797126e-05, + "loss": 0.6327, + "step": 6072, + "task_loss": 1.0488325357437134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.339619517326355, + "epoch": 5.13, + "learning_rate": 2.433220625528318e-05, + "loss": 0.3847, + "step": 6073, + "task_loss": 0.24479015171527863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44702813029289246, + "epoch": 5.13, + "learning_rate": 2.4327979712595097e-05, + "loss": 0.4508, + "step": 6074, + "task_loss": 0.9232329726219177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6177733540534973, + "epoch": 5.14, + "learning_rate": 2.4323753169907017e-05, + "loss": 0.6332, + "step": 6075, + "task_loss": 0.6015790700912476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2910100817680359, + "epoch": 5.14, + "learning_rate": 2.4319526627218937e-05, + "loss": 0.4757, + "step": 6076, + "task_loss": 0.037308115512132645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6398335695266724, + "epoch": 5.14, + "learning_rate": 2.4315300084530853e-05, + "loss": 0.4791, + "step": 6077, + "task_loss": 1.3564711809158325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5828233957290649, + "epoch": 5.14, + "learning_rate": 2.4311073541842773e-05, + "loss": 0.5594, + "step": 6078, + "task_loss": 1.3108546733856201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4572860896587372, + "epoch": 5.14, + "learning_rate": 2.4306846999154693e-05, + "loss": 0.5371, + "step": 6079, + "task_loss": 0.5129013061523438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31945663690567017, + "epoch": 5.14, + "learning_rate": 2.4302620456466612e-05, + "loss": 0.4847, + "step": 6080, + "task_loss": 0.2803064286708832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35488054156303406, + "epoch": 5.14, + "learning_rate": 2.4298393913778532e-05, + "loss": 0.4644, + "step": 6081, + "task_loss": 0.09514985233545303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2647809088230133, + "epoch": 5.14, + "learning_rate": 2.429416737109045e-05, + "loss": 0.4341, + "step": 6082, + "task_loss": 0.7444067001342773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37407463788986206, + "epoch": 5.14, + "learning_rate": 2.4289940828402368e-05, + "loss": 0.5926, + "step": 6083, + "task_loss": 0.8440563678741455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33316361904144287, + "epoch": 5.14, + "learning_rate": 2.4285714285714288e-05, + "loss": 0.42, + "step": 6084, + "task_loss": 0.5290902853012085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20633062720298767, + "epoch": 5.14, + "learning_rate": 2.4281487743026204e-05, + "loss": 0.644, + "step": 6085, + "task_loss": 0.6980563402175903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4087545573711395, + "epoch": 5.14, + "learning_rate": 2.4277261200338124e-05, + "loss": 0.5343, + "step": 6086, + "task_loss": 0.6245479583740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8807015419006348, + "epoch": 5.15, + "learning_rate": 2.4273034657650044e-05, + "loss": 0.6492, + "step": 6087, + "task_loss": 0.5944237112998962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7398768663406372, + "epoch": 5.15, + "learning_rate": 2.4268808114961964e-05, + "loss": 0.624, + "step": 6088, + "task_loss": 0.6951228380203247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7016762495040894, + "epoch": 5.15, + "learning_rate": 2.426458157227388e-05, + "loss": 0.4959, + "step": 6089, + "task_loss": 0.6524354219436646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4098488688468933, + "epoch": 5.15, + "learning_rate": 2.42603550295858e-05, + "loss": 0.4084, + "step": 6090, + "task_loss": 0.17840828001499176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33483564853668213, + "epoch": 5.15, + "learning_rate": 2.425612848689772e-05, + "loss": 0.4244, + "step": 6091, + "task_loss": 0.4534497857093811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.500361979007721, + "epoch": 5.15, + "learning_rate": 2.4251901944209636e-05, + "loss": 0.4635, + "step": 6092, + "task_loss": 0.5212643146514893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49990272521972656, + "epoch": 5.15, + "learning_rate": 2.4247675401521555e-05, + "loss": 0.6138, + "step": 6093, + "task_loss": 0.792811393737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36293935775756836, + "epoch": 5.15, + "learning_rate": 2.4243448858833475e-05, + "loss": 0.4638, + "step": 6094, + "task_loss": 0.29829829931259155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4958467483520508, + "epoch": 5.15, + "learning_rate": 2.4239222316145395e-05, + "loss": 0.5225, + "step": 6095, + "task_loss": 0.779597818851471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5758122205734253, + "epoch": 5.15, + "learning_rate": 2.4234995773457315e-05, + "loss": 0.6499, + "step": 6096, + "task_loss": 1.5735408067703247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4639711380004883, + "epoch": 5.15, + "learning_rate": 2.423076923076923e-05, + "loss": 0.4198, + "step": 6097, + "task_loss": 0.8647044897079468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4379214644432068, + "epoch": 5.15, + "learning_rate": 2.422654268808115e-05, + "loss": 0.6026, + "step": 6098, + "task_loss": 1.3584641218185425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5387359857559204, + "epoch": 5.16, + "learning_rate": 2.422231614539307e-05, + "loss": 0.5422, + "step": 6099, + "task_loss": 0.8965391516685486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27668148279190063, + "epoch": 5.16, + "learning_rate": 2.4218089602704987e-05, + "loss": 0.5114, + "step": 6100, + "task_loss": 0.42457109689712524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7864424586296082, + "epoch": 5.16, + "learning_rate": 2.421386306001691e-05, + "loss": 0.6489, + "step": 6101, + "task_loss": 2.0635387897491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4932714104652405, + "epoch": 5.16, + "learning_rate": 2.4209636517328826e-05, + "loss": 0.5737, + "step": 6102, + "task_loss": 0.30863773822784424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5333161354064941, + "epoch": 5.16, + "learning_rate": 2.4205409974640743e-05, + "loss": 0.5204, + "step": 6103, + "task_loss": 1.0516878366470337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3094266355037689, + "epoch": 5.16, + "learning_rate": 2.4201183431952666e-05, + "loss": 0.3886, + "step": 6104, + "task_loss": 0.4357942044734955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6896642446517944, + "epoch": 5.16, + "learning_rate": 2.4196956889264582e-05, + "loss": 0.448, + "step": 6105, + "task_loss": 0.25991523265838623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7091655135154724, + "epoch": 5.16, + "learning_rate": 2.4192730346576502e-05, + "loss": 0.5194, + "step": 6106, + "task_loss": 0.48243609070777893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32731813192367554, + "epoch": 5.16, + "learning_rate": 2.418850380388842e-05, + "loss": 0.4597, + "step": 6107, + "task_loss": 0.7372286319732666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36532658338546753, + "epoch": 5.16, + "learning_rate": 2.4184277261200338e-05, + "loss": 0.5513, + "step": 6108, + "task_loss": 0.5837984681129456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36492544412612915, + "epoch": 5.16, + "learning_rate": 2.4180050718512258e-05, + "loss": 0.5102, + "step": 6109, + "task_loss": 1.5129467248916626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4459283947944641, + "epoch": 5.16, + "learning_rate": 2.4175824175824177e-05, + "loss": 0.3916, + "step": 6110, + "task_loss": 1.0544507503509521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44582539796829224, + "epoch": 5.17, + "learning_rate": 2.4171597633136094e-05, + "loss": 0.5348, + "step": 6111, + "task_loss": 0.522088885307312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5459232926368713, + "epoch": 5.17, + "learning_rate": 2.4167371090448017e-05, + "loss": 0.5271, + "step": 6112, + "task_loss": 0.7063331007957458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4671485424041748, + "epoch": 5.17, + "learning_rate": 2.4163144547759933e-05, + "loss": 0.4875, + "step": 6113, + "task_loss": 0.6119168400764465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6656586527824402, + "epoch": 5.17, + "learning_rate": 2.415891800507185e-05, + "loss": 0.5573, + "step": 6114, + "task_loss": 0.8138802647590637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49643921852111816, + "epoch": 5.17, + "learning_rate": 2.4154691462383773e-05, + "loss": 0.5104, + "step": 6115, + "task_loss": 0.15977782011032104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40386420488357544, + "epoch": 5.17, + "learning_rate": 2.415046491969569e-05, + "loss": 0.4368, + "step": 6116, + "task_loss": 0.763613224029541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5230156183242798, + "epoch": 5.17, + "learning_rate": 2.414623837700761e-05, + "loss": 0.5556, + "step": 6117, + "task_loss": 0.2062350958585739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3846873342990875, + "epoch": 5.17, + "learning_rate": 2.414201183431953e-05, + "loss": 0.5138, + "step": 6118, + "task_loss": 0.8341571092605591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4112198054790497, + "epoch": 5.17, + "learning_rate": 2.4137785291631445e-05, + "loss": 0.4862, + "step": 6119, + "task_loss": 0.30310359597206116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4547218978404999, + "epoch": 5.17, + "learning_rate": 2.4133558748943365e-05, + "loss": 0.3853, + "step": 6120, + "task_loss": 0.25066572427749634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3382847011089325, + "epoch": 5.17, + "learning_rate": 2.4129332206255284e-05, + "loss": 0.5548, + "step": 6121, + "task_loss": 0.8832724094390869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46316203474998474, + "epoch": 5.17, + "learning_rate": 2.41251056635672e-05, + "loss": 0.6726, + "step": 6122, + "task_loss": 0.7226827144622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8395370244979858, + "epoch": 5.18, + "learning_rate": 2.4120879120879124e-05, + "loss": 0.5376, + "step": 6123, + "task_loss": 0.6016039848327637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4850802421569824, + "epoch": 5.18, + "learning_rate": 2.411665257819104e-05, + "loss": 0.4582, + "step": 6124, + "task_loss": 0.4139668643474579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5096008777618408, + "epoch": 5.18, + "learning_rate": 2.411242603550296e-05, + "loss": 0.4657, + "step": 6125, + "task_loss": 0.9118910431861877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8254024982452393, + "epoch": 5.18, + "learning_rate": 2.410819949281488e-05, + "loss": 0.5861, + "step": 6126, + "task_loss": 0.8758968114852905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39744144678115845, + "epoch": 5.18, + "learning_rate": 2.4103972950126796e-05, + "loss": 0.4624, + "step": 6127, + "task_loss": 0.12627103924751282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0045619010925293, + "epoch": 5.18, + "learning_rate": 2.4099746407438716e-05, + "loss": 0.5819, + "step": 6128, + "task_loss": 0.5143957734107971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5418936014175415, + "epoch": 5.18, + "learning_rate": 2.4095519864750636e-05, + "loss": 0.4532, + "step": 6129, + "task_loss": 1.229182243347168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47781550884246826, + "epoch": 5.18, + "learning_rate": 2.4091293322062555e-05, + "loss": 0.381, + "step": 6130, + "task_loss": 0.8751018643379211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4547562599182129, + "epoch": 5.18, + "learning_rate": 2.408706677937447e-05, + "loss": 0.4861, + "step": 6131, + "task_loss": 1.2296758890151978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.516660213470459, + "epoch": 5.18, + "learning_rate": 2.408284023668639e-05, + "loss": 0.5374, + "step": 6132, + "task_loss": 1.1468462944030762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44642311334609985, + "epoch": 5.18, + "learning_rate": 2.407861369399831e-05, + "loss": 0.4429, + "step": 6133, + "task_loss": 0.3790968060493469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6537061929702759, + "epoch": 5.19, + "learning_rate": 2.407438715131023e-05, + "loss": 0.5378, + "step": 6134, + "task_loss": 1.113348364830017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3634786903858185, + "epoch": 5.19, + "learning_rate": 2.4070160608622147e-05, + "loss": 0.4635, + "step": 6135, + "task_loss": 0.47725826501846313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3838973641395569, + "epoch": 5.19, + "learning_rate": 2.4065934065934067e-05, + "loss": 0.3957, + "step": 6136, + "task_loss": 1.3570797443389893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4662180244922638, + "epoch": 5.19, + "learning_rate": 2.4061707523245987e-05, + "loss": 0.3766, + "step": 6137, + "task_loss": 1.1613508462905884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48386332392692566, + "epoch": 5.19, + "learning_rate": 2.4057480980557906e-05, + "loss": 0.4792, + "step": 6138, + "task_loss": 0.8293060660362244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27913257479667664, + "epoch": 5.19, + "learning_rate": 2.4053254437869823e-05, + "loss": 0.3979, + "step": 6139, + "task_loss": 0.08585330098867416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5151056051254272, + "epoch": 5.19, + "learning_rate": 2.4049027895181742e-05, + "loss": 0.3968, + "step": 6140, + "task_loss": 0.1384936422109604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4784975051879883, + "epoch": 5.19, + "learning_rate": 2.4044801352493662e-05, + "loss": 0.4819, + "step": 6141, + "task_loss": 0.8413895964622498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28375178575515747, + "epoch": 5.19, + "learning_rate": 2.404057480980558e-05, + "loss": 0.4497, + "step": 6142, + "task_loss": 0.18109892308712006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.270940899848938, + "epoch": 5.19, + "learning_rate": 2.40363482671175e-05, + "loss": 0.502, + "step": 6143, + "task_loss": 0.07345021516084671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3857145309448242, + "epoch": 5.19, + "learning_rate": 2.4032121724429418e-05, + "loss": 0.433, + "step": 6144, + "task_loss": 0.6595595479011536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38571372628211975, + "epoch": 5.19, + "learning_rate": 2.4027895181741338e-05, + "loss": 0.5561, + "step": 6145, + "task_loss": 0.6467337608337402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3356383442878723, + "epoch": 5.2, + "learning_rate": 2.4023668639053258e-05, + "loss": 0.3876, + "step": 6146, + "task_loss": 0.5467643141746521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33924928307533264, + "epoch": 5.2, + "learning_rate": 2.4019442096365174e-05, + "loss": 0.4639, + "step": 6147, + "task_loss": 0.31140634417533875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4599660038948059, + "epoch": 5.2, + "learning_rate": 2.4015215553677094e-05, + "loss": 0.4027, + "step": 6148, + "task_loss": 0.6616432666778564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39545682072639465, + "epoch": 5.2, + "learning_rate": 2.4010989010989013e-05, + "loss": 0.4071, + "step": 6149, + "task_loss": 0.6132952570915222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3314233422279358, + "epoch": 5.2, + "learning_rate": 2.400676246830093e-05, + "loss": 0.456, + "step": 6150, + "task_loss": 0.6794623732566833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42838236689567566, + "epoch": 5.2, + "learning_rate": 2.400253592561285e-05, + "loss": 0.4387, + "step": 6151, + "task_loss": 0.494686484336853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30689436197280884, + "epoch": 5.2, + "learning_rate": 2.399830938292477e-05, + "loss": 0.4499, + "step": 6152, + "task_loss": 0.21913358569145203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5312821865081787, + "epoch": 5.2, + "learning_rate": 2.3994082840236686e-05, + "loss": 0.5279, + "step": 6153, + "task_loss": 1.0860376358032227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3166561722755432, + "epoch": 5.2, + "learning_rate": 2.398985629754861e-05, + "loss": 0.479, + "step": 6154, + "task_loss": 0.43716490268707275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7473480701446533, + "epoch": 5.2, + "learning_rate": 2.3985629754860525e-05, + "loss": 0.4912, + "step": 6155, + "task_loss": 0.6622763872146606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4961322844028473, + "epoch": 5.2, + "learning_rate": 2.398140321217244e-05, + "loss": 0.5546, + "step": 6156, + "task_loss": 1.2248502969741821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5788621306419373, + "epoch": 5.2, + "learning_rate": 2.3977176669484364e-05, + "loss": 0.5357, + "step": 6157, + "task_loss": 1.2150005102157593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4575561583042145, + "epoch": 5.21, + "learning_rate": 2.397295012679628e-05, + "loss": 0.4225, + "step": 6158, + "task_loss": 1.0713618993759155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42552024126052856, + "epoch": 5.21, + "learning_rate": 2.39687235841082e-05, + "loss": 0.4823, + "step": 6159, + "task_loss": 0.9159592986106873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5420217514038086, + "epoch": 5.21, + "learning_rate": 2.396449704142012e-05, + "loss": 0.4762, + "step": 6160, + "task_loss": 0.15733765065670013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5595120787620544, + "epoch": 5.21, + "learning_rate": 2.3960270498732037e-05, + "loss": 0.5666, + "step": 6161, + "task_loss": 0.52796870470047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5384198427200317, + "epoch": 5.21, + "learning_rate": 2.395604395604396e-05, + "loss": 0.5238, + "step": 6162, + "task_loss": 1.5352758169174194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41288265585899353, + "epoch": 5.21, + "learning_rate": 2.3951817413355876e-05, + "loss": 0.5318, + "step": 6163, + "task_loss": 1.0586518049240112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30877485871315, + "epoch": 5.21, + "learning_rate": 2.3947590870667792e-05, + "loss": 0.401, + "step": 6164, + "task_loss": 0.4690262973308563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31064373254776, + "epoch": 5.21, + "learning_rate": 2.3943364327979716e-05, + "loss": 0.378, + "step": 6165, + "task_loss": 0.7250860333442688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4359944760799408, + "epoch": 5.21, + "learning_rate": 2.3939137785291632e-05, + "loss": 0.3895, + "step": 6166, + "task_loss": 0.3688778877258301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5869047045707703, + "epoch": 5.21, + "learning_rate": 2.3934911242603552e-05, + "loss": 0.7003, + "step": 6167, + "task_loss": 0.5083137154579163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45609742403030396, + "epoch": 5.21, + "learning_rate": 2.393068469991547e-05, + "loss": 0.5405, + "step": 6168, + "task_loss": 0.7061580419540405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27940666675567627, + "epoch": 5.21, + "learning_rate": 2.3926458157227388e-05, + "loss": 0.4363, + "step": 6169, + "task_loss": 0.7165878415107727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4445539712905884, + "epoch": 5.22, + "learning_rate": 2.3922231614539308e-05, + "loss": 0.483, + "step": 6170, + "task_loss": 0.7307248115539551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48014184832572937, + "epoch": 5.22, + "learning_rate": 2.3918005071851227e-05, + "loss": 0.415, + "step": 6171, + "task_loss": 1.2688418626785278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4180171489715576, + "epoch": 5.22, + "learning_rate": 2.3913778529163144e-05, + "loss": 0.4173, + "step": 6172, + "task_loss": 0.5772649645805359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38885343074798584, + "epoch": 5.22, + "learning_rate": 2.3909551986475063e-05, + "loss": 0.5125, + "step": 6173, + "task_loss": 0.37760668992996216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35532480478286743, + "epoch": 5.22, + "learning_rate": 2.3905325443786983e-05, + "loss": 0.4287, + "step": 6174, + "task_loss": 0.7236931324005127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34226906299591064, + "epoch": 5.22, + "learning_rate": 2.3901098901098903e-05, + "loss": 0.5838, + "step": 6175, + "task_loss": 1.1071233749389648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20892786979675293, + "epoch": 5.22, + "learning_rate": 2.3896872358410823e-05, + "loss": 0.4738, + "step": 6176, + "task_loss": 1.0729855298995972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2818046808242798, + "epoch": 5.22, + "learning_rate": 2.389264581572274e-05, + "loss": 0.4685, + "step": 6177, + "task_loss": 0.6242684125900269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5857528448104858, + "epoch": 5.22, + "learning_rate": 2.388841927303466e-05, + "loss": 0.3985, + "step": 6178, + "task_loss": 0.8326637744903564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6040318608283997, + "epoch": 5.22, + "learning_rate": 2.388419273034658e-05, + "loss": 0.5979, + "step": 6179, + "task_loss": 0.8495544195175171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5291112065315247, + "epoch": 5.22, + "learning_rate": 2.3879966187658495e-05, + "loss": 0.4013, + "step": 6180, + "task_loss": 0.14762166142463684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29011183977127075, + "epoch": 5.22, + "learning_rate": 2.3875739644970414e-05, + "loss": 0.4998, + "step": 6181, + "task_loss": 0.8502899408340454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.54482102394104, + "epoch": 5.23, + "learning_rate": 2.3871513102282334e-05, + "loss": 0.4466, + "step": 6182, + "task_loss": 0.47218915820121765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4998707175254822, + "epoch": 5.23, + "learning_rate": 2.3867286559594254e-05, + "loss": 0.484, + "step": 6183, + "task_loss": 1.1995604038238525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5405217409133911, + "epoch": 5.23, + "learning_rate": 2.386306001690617e-05, + "loss": 0.5226, + "step": 6184, + "task_loss": 0.6516919732093811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40171027183532715, + "epoch": 5.23, + "learning_rate": 2.385883347421809e-05, + "loss": 0.5009, + "step": 6185, + "task_loss": 1.1580421924591064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3655277490615845, + "epoch": 5.23, + "learning_rate": 2.385460693153001e-05, + "loss": 0.4911, + "step": 6186, + "task_loss": 0.7408225536346436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38081884384155273, + "epoch": 5.23, + "learning_rate": 2.385038038884193e-05, + "loss": 0.4583, + "step": 6187, + "task_loss": 0.7897555232048035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35455620288848877, + "epoch": 5.23, + "learning_rate": 2.384615384615385e-05, + "loss": 0.5004, + "step": 6188, + "task_loss": 0.08219151943922043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24608838558197021, + "epoch": 5.23, + "learning_rate": 2.3841927303465766e-05, + "loss": 0.432, + "step": 6189, + "task_loss": 0.10677923262119293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38481777906417847, + "epoch": 5.23, + "learning_rate": 2.3837700760777685e-05, + "loss": 0.4889, + "step": 6190, + "task_loss": 0.14600814878940582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27503272891044617, + "epoch": 5.23, + "learning_rate": 2.3833474218089605e-05, + "loss": 0.3807, + "step": 6191, + "task_loss": 0.5679135322570801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6669226288795471, + "epoch": 5.23, + "learning_rate": 2.382924767540152e-05, + "loss": 0.688, + "step": 6192, + "task_loss": 0.7145673036575317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31415343284606934, + "epoch": 5.23, + "learning_rate": 2.382502113271344e-05, + "loss": 0.3753, + "step": 6193, + "task_loss": 0.5760126709938049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44989722967147827, + "epoch": 5.24, + "learning_rate": 2.382079459002536e-05, + "loss": 0.6227, + "step": 6194, + "task_loss": 0.6788726449012756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28375405073165894, + "epoch": 5.24, + "learning_rate": 2.3816568047337277e-05, + "loss": 0.4985, + "step": 6195, + "task_loss": 0.17066077888011932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36997678875923157, + "epoch": 5.24, + "learning_rate": 2.38123415046492e-05, + "loss": 0.3768, + "step": 6196, + "task_loss": 0.5197086930274963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3998801112174988, + "epoch": 5.24, + "learning_rate": 2.3808114961961117e-05, + "loss": 0.4801, + "step": 6197, + "task_loss": 0.848540186882019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33236798644065857, + "epoch": 5.24, + "learning_rate": 2.3803888419273036e-05, + "loss": 0.3891, + "step": 6198, + "task_loss": 0.03807831183075905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38963958621025085, + "epoch": 5.24, + "learning_rate": 2.3799661876584956e-05, + "loss": 0.5063, + "step": 6199, + "task_loss": 0.21854987740516663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40772736072540283, + "epoch": 5.24, + "learning_rate": 2.3795435333896873e-05, + "loss": 0.5142, + "step": 6200, + "task_loss": 0.4438783824443817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3230104446411133, + "epoch": 5.24, + "learning_rate": 2.3791208791208792e-05, + "loss": 0.4439, + "step": 6201, + "task_loss": 0.040172476321458817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42724302411079407, + "epoch": 5.24, + "learning_rate": 2.3786982248520712e-05, + "loss": 0.5624, + "step": 6202, + "task_loss": 0.5504933595657349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22374588251113892, + "epoch": 5.24, + "learning_rate": 2.378275570583263e-05, + "loss": 0.4967, + "step": 6203, + "task_loss": 0.1483563482761383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2604157030582428, + "epoch": 5.24, + "learning_rate": 2.377852916314455e-05, + "loss": 0.4024, + "step": 6204, + "task_loss": 0.2101244032382965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3066834807395935, + "epoch": 5.24, + "learning_rate": 2.3774302620456468e-05, + "loss": 0.5404, + "step": 6205, + "task_loss": 0.6719635725021362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5508102178573608, + "epoch": 5.25, + "learning_rate": 2.3770076077768384e-05, + "loss": 0.4349, + "step": 6206, + "task_loss": 0.819719135761261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4748684763908386, + "epoch": 5.25, + "learning_rate": 2.3765849535080307e-05, + "loss": 0.5037, + "step": 6207, + "task_loss": 1.0238730907440186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29316431283950806, + "epoch": 5.25, + "learning_rate": 2.3761622992392224e-05, + "loss": 0.4317, + "step": 6208, + "task_loss": 0.1587817668914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.459617555141449, + "epoch": 5.25, + "learning_rate": 2.3757396449704143e-05, + "loss": 0.5014, + "step": 6209, + "task_loss": 0.37662234902381897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6899843811988831, + "epoch": 5.25, + "learning_rate": 2.3753169907016063e-05, + "loss": 0.4618, + "step": 6210, + "task_loss": 0.5653902888298035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4633323848247528, + "epoch": 5.25, + "learning_rate": 2.374894336432798e-05, + "loss": 0.5622, + "step": 6211, + "task_loss": 0.5838003158569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9908839464187622, + "epoch": 5.25, + "learning_rate": 2.37447168216399e-05, + "loss": 0.5773, + "step": 6212, + "task_loss": 0.7032069563865662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40244072675704956, + "epoch": 5.25, + "learning_rate": 2.374049027895182e-05, + "loss": 0.4676, + "step": 6213, + "task_loss": 0.23335306346416473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6343683004379272, + "epoch": 5.25, + "learning_rate": 2.3736263736263735e-05, + "loss": 0.4552, + "step": 6214, + "task_loss": 1.1770395040512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49519848823547363, + "epoch": 5.25, + "learning_rate": 2.373203719357566e-05, + "loss": 0.5787, + "step": 6215, + "task_loss": 1.8557019233703613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0788205862045288, + "epoch": 5.25, + "learning_rate": 2.3727810650887575e-05, + "loss": 0.6002, + "step": 6216, + "task_loss": 1.432248592376709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3671010732650757, + "epoch": 5.26, + "learning_rate": 2.3723584108199495e-05, + "loss": 0.3877, + "step": 6217, + "task_loss": 0.8160368204116821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14648397266864777, + "epoch": 5.26, + "learning_rate": 2.3719357565511414e-05, + "loss": 0.4201, + "step": 6218, + "task_loss": 0.5890552997589111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3210896849632263, + "epoch": 5.26, + "learning_rate": 2.371513102282333e-05, + "loss": 0.3525, + "step": 6219, + "task_loss": 0.11566920578479767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48766273260116577, + "epoch": 5.26, + "learning_rate": 2.371090448013525e-05, + "loss": 0.5473, + "step": 6220, + "task_loss": 1.1073604822158813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.589460015296936, + "epoch": 5.26, + "learning_rate": 2.370667793744717e-05, + "loss": 0.4836, + "step": 6221, + "task_loss": 0.1785520315170288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27041614055633545, + "epoch": 5.26, + "learning_rate": 2.3702451394759087e-05, + "loss": 0.4561, + "step": 6222, + "task_loss": 0.2553998827934265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33399829268455505, + "epoch": 5.26, + "learning_rate": 2.3698224852071006e-05, + "loss": 0.421, + "step": 6223, + "task_loss": 0.15752455592155457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31323787569999695, + "epoch": 5.26, + "learning_rate": 2.3693998309382926e-05, + "loss": 0.4284, + "step": 6224, + "task_loss": 0.21446283161640167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5507957935333252, + "epoch": 5.26, + "learning_rate": 2.3689771766694846e-05, + "loss": 0.5554, + "step": 6225, + "task_loss": 1.9506876468658447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44343873858451843, + "epoch": 5.26, + "learning_rate": 2.3685545224006765e-05, + "loss": 0.3812, + "step": 6226, + "task_loss": 0.5801596641540527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3441895842552185, + "epoch": 5.26, + "learning_rate": 2.3681318681318682e-05, + "loss": 0.4325, + "step": 6227, + "task_loss": 0.7725982069969177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7963119149208069, + "epoch": 5.26, + "learning_rate": 2.36770921386306e-05, + "loss": 0.6446, + "step": 6228, + "task_loss": 1.8608191013336182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.575245201587677, + "epoch": 5.27, + "learning_rate": 2.367286559594252e-05, + "loss": 0.4851, + "step": 6229, + "task_loss": 0.590059220790863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.537763774394989, + "epoch": 5.27, + "learning_rate": 2.3668639053254438e-05, + "loss": 0.5185, + "step": 6230, + "task_loss": 0.6747527718544006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25006818771362305, + "epoch": 5.27, + "learning_rate": 2.3664412510566357e-05, + "loss": 0.4855, + "step": 6231, + "task_loss": 0.3574564456939697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.12133331596851349, + "epoch": 5.27, + "learning_rate": 2.3660185967878277e-05, + "loss": 0.4084, + "step": 6232, + "task_loss": 0.015367194078862667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5480034351348877, + "epoch": 5.27, + "learning_rate": 2.3655959425190197e-05, + "loss": 0.527, + "step": 6233, + "task_loss": 1.6213537454605103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44211721420288086, + "epoch": 5.27, + "learning_rate": 2.3651732882502113e-05, + "loss": 0.5621, + "step": 6234, + "task_loss": 0.9960834980010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34656888246536255, + "epoch": 5.27, + "learning_rate": 2.3647506339814033e-05, + "loss": 0.4312, + "step": 6235, + "task_loss": 0.2736213505268097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5731849670410156, + "epoch": 5.27, + "learning_rate": 2.3643279797125953e-05, + "loss": 0.4383, + "step": 6236, + "task_loss": 0.29941362142562866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4600813388824463, + "epoch": 5.27, + "learning_rate": 2.363905325443787e-05, + "loss": 0.4341, + "step": 6237, + "task_loss": 0.36316174268722534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6788662075996399, + "epoch": 5.27, + "learning_rate": 2.363482671174979e-05, + "loss": 0.5862, + "step": 6238, + "task_loss": 0.7667924165725708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6313486099243164, + "epoch": 5.27, + "learning_rate": 2.363060016906171e-05, + "loss": 0.3478, + "step": 6239, + "task_loss": 0.39742758870124817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7032457590103149, + "epoch": 5.27, + "learning_rate": 2.3626373626373628e-05, + "loss": 0.5508, + "step": 6240, + "task_loss": 0.26318061351776123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6162355542182922, + "epoch": 5.28, + "learning_rate": 2.3622147083685548e-05, + "loss": 0.4791, + "step": 6241, + "task_loss": 1.9621481895446777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5780901908874512, + "epoch": 5.28, + "learning_rate": 2.3617920540997464e-05, + "loss": 0.4403, + "step": 6242, + "task_loss": 0.7662016153335571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4235873818397522, + "epoch": 5.28, + "learning_rate": 2.3613693998309384e-05, + "loss": 0.4591, + "step": 6243, + "task_loss": 1.1155574321746826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2787354588508606, + "epoch": 5.28, + "learning_rate": 2.3609467455621304e-05, + "loss": 0.3899, + "step": 6244, + "task_loss": 0.6909473538398743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44485950469970703, + "epoch": 5.28, + "learning_rate": 2.360524091293322e-05, + "loss": 0.5258, + "step": 6245, + "task_loss": 0.7350509166717529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3484078645706177, + "epoch": 5.28, + "learning_rate": 2.3601014370245143e-05, + "loss": 0.3873, + "step": 6246, + "task_loss": 0.49561452865600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5656984448432922, + "epoch": 5.28, + "learning_rate": 2.359678782755706e-05, + "loss": 0.5469, + "step": 6247, + "task_loss": 0.6476038098335266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29890021681785583, + "epoch": 5.28, + "learning_rate": 2.3592561284868976e-05, + "loss": 0.3794, + "step": 6248, + "task_loss": 0.34847554564476013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5308851003646851, + "epoch": 5.28, + "learning_rate": 2.35883347421809e-05, + "loss": 0.4009, + "step": 6249, + "task_loss": 1.101190209388733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6128857135772705, + "epoch": 5.28, + "learning_rate": 2.3584108199492815e-05, + "loss": 0.4499, + "step": 6250, + "task_loss": 1.1265512704849243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16186052560806274, + "epoch": 5.28, + "learning_rate": 2.3579881656804735e-05, + "loss": 0.4538, + "step": 6251, + "task_loss": 1.03965425491333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4434671401977539, + "epoch": 5.28, + "learning_rate": 2.3575655114116655e-05, + "loss": 0.47, + "step": 6252, + "task_loss": 0.6152116656303406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28809115290641785, + "epoch": 5.29, + "learning_rate": 2.357142857142857e-05, + "loss": 0.4019, + "step": 6253, + "task_loss": 0.48627209663391113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3234362006187439, + "epoch": 5.29, + "learning_rate": 2.356720202874049e-05, + "loss": 0.46, + "step": 6254, + "task_loss": 0.4574730694293976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5012171268463135, + "epoch": 5.29, + "learning_rate": 2.356297548605241e-05, + "loss": 0.4611, + "step": 6255, + "task_loss": 0.7161834836006165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6277874708175659, + "epoch": 5.29, + "learning_rate": 2.3558748943364327e-05, + "loss": 0.5396, + "step": 6256, + "task_loss": 0.9952029585838318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5955291390419006, + "epoch": 5.29, + "learning_rate": 2.355452240067625e-05, + "loss": 0.4362, + "step": 6257, + "task_loss": 0.4639523923397064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4481378495693207, + "epoch": 5.29, + "learning_rate": 2.3550295857988167e-05, + "loss": 0.5522, + "step": 6258, + "task_loss": 0.4169240891933441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4396369457244873, + "epoch": 5.29, + "learning_rate": 2.3546069315300083e-05, + "loss": 0.5029, + "step": 6259, + "task_loss": 0.200527161359787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34670278429985046, + "epoch": 5.29, + "learning_rate": 2.3541842772612006e-05, + "loss": 0.4039, + "step": 6260, + "task_loss": 1.1637672185897827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39813128113746643, + "epoch": 5.29, + "learning_rate": 2.3537616229923922e-05, + "loss": 0.4282, + "step": 6261, + "task_loss": 0.27429771423339844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6128538846969604, + "epoch": 5.29, + "learning_rate": 2.3533389687235842e-05, + "loss": 0.4492, + "step": 6262, + "task_loss": 0.508765697479248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48425254225730896, + "epoch": 5.29, + "learning_rate": 2.3529163144547762e-05, + "loss": 0.5545, + "step": 6263, + "task_loss": 0.07327888906002045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4513271152973175, + "epoch": 5.29, + "learning_rate": 2.3524936601859678e-05, + "loss": 0.4591, + "step": 6264, + "task_loss": 1.0417494773864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5475974082946777, + "epoch": 5.3, + "learning_rate": 2.3520710059171598e-05, + "loss": 0.5554, + "step": 6265, + "task_loss": 0.5591528415679932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3692634701728821, + "epoch": 5.3, + "learning_rate": 2.3516483516483518e-05, + "loss": 0.4618, + "step": 6266, + "task_loss": 0.7540041208267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40020036697387695, + "epoch": 5.3, + "learning_rate": 2.3512256973795434e-05, + "loss": 0.4721, + "step": 6267, + "task_loss": 0.2871745824813843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34949609637260437, + "epoch": 5.3, + "learning_rate": 2.3508030431107357e-05, + "loss": 0.4112, + "step": 6268, + "task_loss": 0.6027837991714478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49232056736946106, + "epoch": 5.3, + "learning_rate": 2.3503803888419274e-05, + "loss": 0.3764, + "step": 6269, + "task_loss": 0.4374786615371704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33673879504203796, + "epoch": 5.3, + "learning_rate": 2.3499577345731193e-05, + "loss": 0.6507, + "step": 6270, + "task_loss": 0.06498978286981583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3401879072189331, + "epoch": 5.3, + "learning_rate": 2.3495350803043113e-05, + "loss": 0.4364, + "step": 6271, + "task_loss": 0.2961442470550537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6984174251556396, + "epoch": 5.3, + "learning_rate": 2.349112426035503e-05, + "loss": 0.6421, + "step": 6272, + "task_loss": 1.2203418016433716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4302659034729004, + "epoch": 5.3, + "learning_rate": 2.348689771766695e-05, + "loss": 0.5239, + "step": 6273, + "task_loss": 0.6233135461807251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15413682162761688, + "epoch": 5.3, + "learning_rate": 2.348267117497887e-05, + "loss": 0.333, + "step": 6274, + "task_loss": 0.1554541438817978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32594770193099976, + "epoch": 5.3, + "learning_rate": 2.347844463229079e-05, + "loss": 0.3904, + "step": 6275, + "task_loss": 0.08425401151180267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3967881202697754, + "epoch": 5.3, + "learning_rate": 2.3474218089602705e-05, + "loss": 0.4763, + "step": 6276, + "task_loss": 0.8253606557846069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6584041714668274, + "epoch": 5.31, + "learning_rate": 2.3469991546914625e-05, + "loss": 0.4435, + "step": 6277, + "task_loss": 1.5819460153579712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5217419862747192, + "epoch": 5.31, + "learning_rate": 2.3465765004226544e-05, + "loss": 0.5061, + "step": 6278, + "task_loss": 0.2556835114955902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4729047417640686, + "epoch": 5.31, + "learning_rate": 2.3461538461538464e-05, + "loss": 0.4692, + "step": 6279, + "task_loss": 0.2707321047782898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9134374260902405, + "epoch": 5.31, + "learning_rate": 2.345731191885038e-05, + "loss": 0.6427, + "step": 6280, + "task_loss": 1.277496337890625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.263666033744812, + "epoch": 5.31, + "learning_rate": 2.34530853761623e-05, + "loss": 0.3502, + "step": 6281, + "task_loss": 0.5824320912361145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4495238959789276, + "epoch": 5.31, + "learning_rate": 2.344885883347422e-05, + "loss": 0.4482, + "step": 6282, + "task_loss": 0.8620867729187012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2633923292160034, + "epoch": 5.31, + "learning_rate": 2.344463229078614e-05, + "loss": 0.4124, + "step": 6283, + "task_loss": 0.6501432657241821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.281352162361145, + "epoch": 5.31, + "learning_rate": 2.3440405748098056e-05, + "loss": 0.5196, + "step": 6284, + "task_loss": 0.9025052785873413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4177960157394409, + "epoch": 5.31, + "learning_rate": 2.3436179205409976e-05, + "loss": 0.4289, + "step": 6285, + "task_loss": 0.640766441822052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6713600158691406, + "epoch": 5.31, + "learning_rate": 2.3431952662721896e-05, + "loss": 0.5789, + "step": 6286, + "task_loss": 0.4753790497779846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38802266120910645, + "epoch": 5.31, + "learning_rate": 2.3427726120033812e-05, + "loss": 0.4026, + "step": 6287, + "task_loss": 0.218027725815773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4825308620929718, + "epoch": 5.32, + "learning_rate": 2.342349957734573e-05, + "loss": 0.4285, + "step": 6288, + "task_loss": 0.7239080667495728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25949832797050476, + "epoch": 5.32, + "learning_rate": 2.341927303465765e-05, + "loss": 0.3898, + "step": 6289, + "task_loss": 0.15181967616081238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5462753176689148, + "epoch": 5.32, + "learning_rate": 2.341504649196957e-05, + "loss": 0.5793, + "step": 6290, + "task_loss": 0.3417903482913971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4920628070831299, + "epoch": 5.32, + "learning_rate": 2.341081994928149e-05, + "loss": 0.471, + "step": 6291, + "task_loss": 0.855144739151001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31098222732543945, + "epoch": 5.32, + "learning_rate": 2.3406593406593407e-05, + "loss": 0.4058, + "step": 6292, + "task_loss": 0.5844261050224304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8183332085609436, + "epoch": 5.32, + "learning_rate": 2.3402366863905327e-05, + "loss": 0.4288, + "step": 6293, + "task_loss": 0.9094383716583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6002986431121826, + "epoch": 5.32, + "learning_rate": 2.3398140321217247e-05, + "loss": 0.5538, + "step": 6294, + "task_loss": 0.46236011385917664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7307037115097046, + "epoch": 5.32, + "learning_rate": 2.3393913778529163e-05, + "loss": 0.5954, + "step": 6295, + "task_loss": 0.9432345628738403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5027535557746887, + "epoch": 5.32, + "learning_rate": 2.3389687235841083e-05, + "loss": 0.4874, + "step": 6296, + "task_loss": 0.6954266428947449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5634467601776123, + "epoch": 5.32, + "learning_rate": 2.3385460693153003e-05, + "loss": 0.5371, + "step": 6297, + "task_loss": 1.1679253578186035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24532440304756165, + "epoch": 5.32, + "learning_rate": 2.338123415046492e-05, + "loss": 0.3951, + "step": 6298, + "task_loss": 0.9199677109718323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9492353200912476, + "epoch": 5.32, + "learning_rate": 2.3377007607776842e-05, + "loss": 0.5881, + "step": 6299, + "task_loss": 1.0791666507720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.473374605178833, + "epoch": 5.33, + "learning_rate": 2.337278106508876e-05, + "loss": 0.4744, + "step": 6300, + "task_loss": 0.5616335868835449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8260717391967773, + "epoch": 5.33, + "learning_rate": 2.3368554522400675e-05, + "loss": 0.6269, + "step": 6301, + "task_loss": 1.231544852256775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4669424593448639, + "epoch": 5.33, + "learning_rate": 2.3364327979712598e-05, + "loss": 0.5183, + "step": 6302, + "task_loss": 0.44171619415283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4790842831134796, + "epoch": 5.33, + "learning_rate": 2.3360101437024514e-05, + "loss": 0.4442, + "step": 6303, + "task_loss": 1.087006688117981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.267011433839798, + "epoch": 5.33, + "learning_rate": 2.3355874894336434e-05, + "loss": 0.4156, + "step": 6304, + "task_loss": 1.3671830892562866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8638502359390259, + "epoch": 5.33, + "learning_rate": 2.3351648351648354e-05, + "loss": 0.6019, + "step": 6305, + "task_loss": 0.4426146149635315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5196127891540527, + "epoch": 5.33, + "learning_rate": 2.334742180896027e-05, + "loss": 0.58, + "step": 6306, + "task_loss": 0.6124235391616821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26033371686935425, + "epoch": 5.33, + "learning_rate": 2.334319526627219e-05, + "loss": 0.539, + "step": 6307, + "task_loss": 0.3434199392795563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5375725626945496, + "epoch": 5.33, + "learning_rate": 2.333896872358411e-05, + "loss": 0.5522, + "step": 6308, + "task_loss": 0.6082590222358704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5345404148101807, + "epoch": 5.33, + "learning_rate": 2.3334742180896026e-05, + "loss": 0.4097, + "step": 6309, + "task_loss": 1.2123430967330933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25966495275497437, + "epoch": 5.33, + "learning_rate": 2.333051563820795e-05, + "loss": 0.3246, + "step": 6310, + "task_loss": 0.02176729030907154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5519406795501709, + "epoch": 5.33, + "learning_rate": 2.3326289095519865e-05, + "loss": 0.5274, + "step": 6311, + "task_loss": 0.40845367312431335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.570057213306427, + "epoch": 5.34, + "learning_rate": 2.3322062552831785e-05, + "loss": 0.4373, + "step": 6312, + "task_loss": 0.3954927921295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7295188903808594, + "epoch": 5.34, + "learning_rate": 2.3317836010143705e-05, + "loss": 0.4618, + "step": 6313, + "task_loss": 0.880217432975769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5778757929801941, + "epoch": 5.34, + "learning_rate": 2.331360946745562e-05, + "loss": 0.4438, + "step": 6314, + "task_loss": 0.27902382612228394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26342594623565674, + "epoch": 5.34, + "learning_rate": 2.330938292476754e-05, + "loss": 0.4656, + "step": 6315, + "task_loss": 0.7434739470481873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35087716579437256, + "epoch": 5.34, + "learning_rate": 2.330515638207946e-05, + "loss": 0.3003, + "step": 6316, + "task_loss": 0.21923521161079407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4444340467453003, + "epoch": 5.34, + "learning_rate": 2.3300929839391377e-05, + "loss": 0.4964, + "step": 6317, + "task_loss": 0.7905334234237671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26901328563690186, + "epoch": 5.34, + "learning_rate": 2.3296703296703297e-05, + "loss": 0.3457, + "step": 6318, + "task_loss": 0.08010376989841461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0238771438598633, + "epoch": 5.34, + "learning_rate": 2.3292476754015216e-05, + "loss": 0.6925, + "step": 6319, + "task_loss": 1.4084781408309937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7447539567947388, + "epoch": 5.34, + "learning_rate": 2.3288250211327136e-05, + "loss": 0.4913, + "step": 6320, + "task_loss": 0.694594144821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3836894631385803, + "epoch": 5.34, + "learning_rate": 2.3284023668639056e-05, + "loss": 0.4298, + "step": 6321, + "task_loss": 1.0191599130630493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6567772030830383, + "epoch": 5.34, + "learning_rate": 2.3279797125950972e-05, + "loss": 0.541, + "step": 6322, + "task_loss": 1.4293559789657593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22209644317626953, + "epoch": 5.34, + "learning_rate": 2.3275570583262892e-05, + "loss": 0.362, + "step": 6323, + "task_loss": 0.21419844031333923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6108503341674805, + "epoch": 5.35, + "learning_rate": 2.3271344040574812e-05, + "loss": 0.4399, + "step": 6324, + "task_loss": 0.47167152166366577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6511044502258301, + "epoch": 5.35, + "learning_rate": 2.3267117497886728e-05, + "loss": 0.503, + "step": 6325, + "task_loss": 0.6655309796333313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5060898661613464, + "epoch": 5.35, + "learning_rate": 2.3262890955198648e-05, + "loss": 0.5004, + "step": 6326, + "task_loss": 0.23334598541259766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6852645874023438, + "epoch": 5.35, + "learning_rate": 2.3258664412510568e-05, + "loss": 0.5676, + "step": 6327, + "task_loss": 0.6163545846939087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.705136775970459, + "epoch": 5.35, + "learning_rate": 2.3254437869822487e-05, + "loss": 0.7197, + "step": 6328, + "task_loss": 0.3037494421005249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.907212495803833, + "epoch": 5.35, + "learning_rate": 2.3250211327134404e-05, + "loss": 0.6543, + "step": 6329, + "task_loss": 0.3265439569950104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7458125948905945, + "epoch": 5.35, + "learning_rate": 2.3245984784446323e-05, + "loss": 0.4454, + "step": 6330, + "task_loss": 0.8707532286643982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4769487977027893, + "epoch": 5.35, + "learning_rate": 2.3241758241758243e-05, + "loss": 0.5082, + "step": 6331, + "task_loss": 0.7781730890274048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4995899200439453, + "epoch": 5.35, + "learning_rate": 2.3237531699070163e-05, + "loss": 0.4806, + "step": 6332, + "task_loss": 0.6673713326454163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.275932252407074, + "epoch": 5.35, + "learning_rate": 2.3233305156382083e-05, + "loss": 0.3908, + "step": 6333, + "task_loss": 0.34814953804016113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37716493010520935, + "epoch": 5.35, + "learning_rate": 2.3229078613694e-05, + "loss": 0.4074, + "step": 6334, + "task_loss": 0.3317495584487915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5032260417938232, + "epoch": 5.35, + "learning_rate": 2.322485207100592e-05, + "loss": 0.4644, + "step": 6335, + "task_loss": 0.4129529595375061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39171987771987915, + "epoch": 5.36, + "learning_rate": 2.322062552831784e-05, + "loss": 0.3735, + "step": 6336, + "task_loss": 0.2407274693250656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5966951847076416, + "epoch": 5.36, + "learning_rate": 2.3216398985629755e-05, + "loss": 0.6551, + "step": 6337, + "task_loss": 1.4332427978515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5214473605155945, + "epoch": 5.36, + "learning_rate": 2.3212172442941675e-05, + "loss": 0.5769, + "step": 6338, + "task_loss": 0.2481902688741684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6677607297897339, + "epoch": 5.36, + "learning_rate": 2.3207945900253594e-05, + "loss": 0.5079, + "step": 6339, + "task_loss": 0.8833622336387634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3551899790763855, + "epoch": 5.36, + "learning_rate": 2.320371935756551e-05, + "loss": 0.5046, + "step": 6340, + "task_loss": 0.6206568479537964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47297364473342896, + "epoch": 5.36, + "learning_rate": 2.3199492814877434e-05, + "loss": 0.481, + "step": 6341, + "task_loss": 1.128438115119934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49855107069015503, + "epoch": 5.36, + "learning_rate": 2.319526627218935e-05, + "loss": 0.7045, + "step": 6342, + "task_loss": 0.7372344136238098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4442156255245209, + "epoch": 5.36, + "learning_rate": 2.319103972950127e-05, + "loss": 0.5127, + "step": 6343, + "task_loss": 0.5850629210472107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29978296160697937, + "epoch": 5.36, + "learning_rate": 2.318681318681319e-05, + "loss": 0.5218, + "step": 6344, + "task_loss": 0.5273677706718445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28832143545150757, + "epoch": 5.36, + "learning_rate": 2.3182586644125106e-05, + "loss": 0.4649, + "step": 6345, + "task_loss": 0.34064310789108276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36183393001556396, + "epoch": 5.36, + "learning_rate": 2.3178360101437026e-05, + "loss": 0.3171, + "step": 6346, + "task_loss": 0.39563506841659546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4016374349594116, + "epoch": 5.36, + "learning_rate": 2.3174133558748945e-05, + "loss": 0.4561, + "step": 6347, + "task_loss": 1.013319969177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44493088126182556, + "epoch": 5.37, + "learning_rate": 2.3169907016060862e-05, + "loss": 0.4621, + "step": 6348, + "task_loss": 0.6441196799278259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3102928698062897, + "epoch": 5.37, + "learning_rate": 2.3165680473372785e-05, + "loss": 0.4571, + "step": 6349, + "task_loss": 0.43051812052726746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4973911643028259, + "epoch": 5.37, + "learning_rate": 2.31614539306847e-05, + "loss": 0.5559, + "step": 6350, + "task_loss": 1.2376153469085693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4312673807144165, + "epoch": 5.37, + "learning_rate": 2.3157227387996618e-05, + "loss": 0.3635, + "step": 6351, + "task_loss": 0.6045324206352234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.556331217288971, + "epoch": 5.37, + "learning_rate": 2.315300084530854e-05, + "loss": 0.4133, + "step": 6352, + "task_loss": 0.2536722719669342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45933598279953003, + "epoch": 5.37, + "learning_rate": 2.3148774302620457e-05, + "loss": 0.5702, + "step": 6353, + "task_loss": 1.744941234588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2884848415851593, + "epoch": 5.37, + "learning_rate": 2.3144547759932377e-05, + "loss": 0.5027, + "step": 6354, + "task_loss": 0.9973456859588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5337940454483032, + "epoch": 5.37, + "learning_rate": 2.3140321217244297e-05, + "loss": 0.599, + "step": 6355, + "task_loss": 0.6655141711235046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5315665006637573, + "epoch": 5.37, + "learning_rate": 2.3136094674556213e-05, + "loss": 0.4786, + "step": 6356, + "task_loss": 0.5373191237449646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40978026390075684, + "epoch": 5.37, + "learning_rate": 2.3131868131868133e-05, + "loss": 0.4364, + "step": 6357, + "task_loss": 0.1383046805858612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3069595992565155, + "epoch": 5.37, + "learning_rate": 2.3127641589180052e-05, + "loss": 0.4057, + "step": 6358, + "task_loss": 0.37925341725349426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4968445897102356, + "epoch": 5.38, + "learning_rate": 2.312341504649197e-05, + "loss": 0.5779, + "step": 6359, + "task_loss": 0.6606655716896057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7478101849555969, + "epoch": 5.38, + "learning_rate": 2.3119188503803892e-05, + "loss": 0.5846, + "step": 6360, + "task_loss": 0.9579147100448608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7198728919029236, + "epoch": 5.38, + "learning_rate": 2.3114961961115808e-05, + "loss": 0.5251, + "step": 6361, + "task_loss": 0.5034919381141663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41016674041748047, + "epoch": 5.38, + "learning_rate": 2.3110735418427728e-05, + "loss": 0.4581, + "step": 6362, + "task_loss": 0.6901698708534241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4036819040775299, + "epoch": 5.38, + "learning_rate": 2.3106508875739648e-05, + "loss": 0.5376, + "step": 6363, + "task_loss": 0.5144193172454834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4146765470504761, + "epoch": 5.38, + "learning_rate": 2.3102282333051564e-05, + "loss": 0.4492, + "step": 6364, + "task_loss": 0.14593659341335297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3329591155052185, + "epoch": 5.38, + "learning_rate": 2.3098055790363484e-05, + "loss": 0.5064, + "step": 6365, + "task_loss": 0.1634732484817505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38214319944381714, + "epoch": 5.38, + "learning_rate": 2.3093829247675404e-05, + "loss": 0.4409, + "step": 6366, + "task_loss": 0.2822558581829071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43107059597969055, + "epoch": 5.38, + "learning_rate": 2.308960270498732e-05, + "loss": 0.5129, + "step": 6367, + "task_loss": 0.43987447023391724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4098028838634491, + "epoch": 5.38, + "learning_rate": 2.308537616229924e-05, + "loss": 0.5469, + "step": 6368, + "task_loss": 0.71338951587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4446815848350525, + "epoch": 5.38, + "learning_rate": 2.308114961961116e-05, + "loss": 0.589, + "step": 6369, + "task_loss": 0.22819450497627258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36666053533554077, + "epoch": 5.38, + "learning_rate": 2.307692307692308e-05, + "loss": 0.5098, + "step": 6370, + "task_loss": 0.5681432485580444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4629097580909729, + "epoch": 5.39, + "learning_rate": 2.3072696534234995e-05, + "loss": 0.5048, + "step": 6371, + "task_loss": 0.5546885132789612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36749204993247986, + "epoch": 5.39, + "learning_rate": 2.3068469991546915e-05, + "loss": 0.4672, + "step": 6372, + "task_loss": 0.33122384548187256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4888591170310974, + "epoch": 5.39, + "learning_rate": 2.3064243448858835e-05, + "loss": 0.4286, + "step": 6373, + "task_loss": 0.506497323513031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5156115293502808, + "epoch": 5.39, + "learning_rate": 2.3060016906170755e-05, + "loss": 0.5264, + "step": 6374, + "task_loss": 0.8370395302772522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5073370933532715, + "epoch": 5.39, + "learning_rate": 2.305579036348267e-05, + "loss": 0.4431, + "step": 6375, + "task_loss": 0.4581719934940338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2702508866786957, + "epoch": 5.39, + "learning_rate": 2.305156382079459e-05, + "loss": 0.3415, + "step": 6376, + "task_loss": 0.702288806438446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.64216548204422, + "epoch": 5.39, + "learning_rate": 2.304733727810651e-05, + "loss": 0.3822, + "step": 6377, + "task_loss": 0.7829493284225464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5027613639831543, + "epoch": 5.39, + "learning_rate": 2.304311073541843e-05, + "loss": 0.6342, + "step": 6378, + "task_loss": 0.16487187147140503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7633986473083496, + "epoch": 5.39, + "learning_rate": 2.3038884192730347e-05, + "loss": 0.5252, + "step": 6379, + "task_loss": 0.7553760409355164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.526973307132721, + "epoch": 5.39, + "learning_rate": 2.3034657650042266e-05, + "loss": 0.4545, + "step": 6380, + "task_loss": 1.0537790060043335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5727398991584778, + "epoch": 5.39, + "learning_rate": 2.3030431107354186e-05, + "loss": 0.4678, + "step": 6381, + "task_loss": 0.44452589750289917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30572181940078735, + "epoch": 5.39, + "learning_rate": 2.3026204564666102e-05, + "loss": 0.3639, + "step": 6382, + "task_loss": 0.24077633023262024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44673413038253784, + "epoch": 5.4, + "learning_rate": 2.3021978021978022e-05, + "loss": 0.4376, + "step": 6383, + "task_loss": 0.34365618228912354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4417877793312073, + "epoch": 5.4, + "learning_rate": 2.3017751479289942e-05, + "loss": 0.5427, + "step": 6384, + "task_loss": 0.7422190308570862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9750477075576782, + "epoch": 5.4, + "learning_rate": 2.301352493660186e-05, + "loss": 0.5902, + "step": 6385, + "task_loss": 1.2108874320983887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36852145195007324, + "epoch": 5.4, + "learning_rate": 2.300929839391378e-05, + "loss": 0.4756, + "step": 6386, + "task_loss": 0.06643177568912506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2536177933216095, + "epoch": 5.4, + "learning_rate": 2.3005071851225698e-05, + "loss": 0.5338, + "step": 6387, + "task_loss": 0.5156050324440002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6774274706840515, + "epoch": 5.4, + "learning_rate": 2.3000845308537617e-05, + "loss": 0.5563, + "step": 6388, + "task_loss": 0.4827357828617096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4548439383506775, + "epoch": 5.4, + "learning_rate": 2.2996618765849537e-05, + "loss": 0.4951, + "step": 6389, + "task_loss": 0.19972564280033112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6481550335884094, + "epoch": 5.4, + "learning_rate": 2.2992392223161454e-05, + "loss": 0.5465, + "step": 6390, + "task_loss": 1.2912670373916626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.632424533367157, + "epoch": 5.4, + "learning_rate": 2.2988165680473377e-05, + "loss": 0.4956, + "step": 6391, + "task_loss": 0.7639898657798767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47841742634773254, + "epoch": 5.4, + "learning_rate": 2.2983939137785293e-05, + "loss": 0.4975, + "step": 6392, + "task_loss": 1.1369231939315796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4226456582546234, + "epoch": 5.4, + "learning_rate": 2.297971259509721e-05, + "loss": 0.5512, + "step": 6393, + "task_loss": 0.4186021387577057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5733023881912231, + "epoch": 5.4, + "learning_rate": 2.2975486052409132e-05, + "loss": 0.5018, + "step": 6394, + "task_loss": 0.7421029806137085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6429904699325562, + "epoch": 5.41, + "learning_rate": 2.297125950972105e-05, + "loss": 0.4053, + "step": 6395, + "task_loss": 0.5905466079711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5939153432846069, + "epoch": 5.41, + "learning_rate": 2.296703296703297e-05, + "loss": 0.504, + "step": 6396, + "task_loss": 0.969499409198761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4579850137233734, + "epoch": 5.41, + "learning_rate": 2.2962806424344888e-05, + "loss": 0.6099, + "step": 6397, + "task_loss": 1.0757176876068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4377705454826355, + "epoch": 5.41, + "learning_rate": 2.2958579881656805e-05, + "loss": 0.4037, + "step": 6398, + "task_loss": 0.11609987169504166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33455055952072144, + "epoch": 5.41, + "learning_rate": 2.2954353338968724e-05, + "loss": 0.5197, + "step": 6399, + "task_loss": 0.9170243144035339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3963354229927063, + "epoch": 5.41, + "learning_rate": 2.2950126796280644e-05, + "loss": 0.4247, + "step": 6400, + "task_loss": 0.5958443284034729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2776285409927368, + "epoch": 5.41, + "learning_rate": 2.294590025359256e-05, + "loss": 0.4238, + "step": 6401, + "task_loss": 0.402399480342865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3305109143257141, + "epoch": 5.41, + "learning_rate": 2.2941673710904484e-05, + "loss": 0.5602, + "step": 6402, + "task_loss": 0.2626878619194031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42549020051956177, + "epoch": 5.41, + "learning_rate": 2.29374471682164e-05, + "loss": 0.5939, + "step": 6403, + "task_loss": 0.7085230350494385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3904922604560852, + "epoch": 5.41, + "learning_rate": 2.2933220625528316e-05, + "loss": 0.3681, + "step": 6404, + "task_loss": 0.669359028339386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4589058458805084, + "epoch": 5.41, + "learning_rate": 2.292899408284024e-05, + "loss": 0.4913, + "step": 6405, + "task_loss": 0.1431789994239807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2585071325302124, + "epoch": 5.41, + "learning_rate": 2.2924767540152156e-05, + "loss": 0.4226, + "step": 6406, + "task_loss": 0.5096858143806458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6571400761604309, + "epoch": 5.42, + "learning_rate": 2.2920540997464076e-05, + "loss": 0.5796, + "step": 6407, + "task_loss": 0.9306409955024719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5002869963645935, + "epoch": 5.42, + "learning_rate": 2.2916314454775995e-05, + "loss": 0.362, + "step": 6408, + "task_loss": 0.377849280834198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27194857597351074, + "epoch": 5.42, + "learning_rate": 2.291208791208791e-05, + "loss": 0.3585, + "step": 6409, + "task_loss": 0.12705577909946442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48667553067207336, + "epoch": 5.42, + "learning_rate": 2.290786136939983e-05, + "loss": 0.4944, + "step": 6410, + "task_loss": 0.9912238121032715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6095097064971924, + "epoch": 5.42, + "learning_rate": 2.290363482671175e-05, + "loss": 0.5013, + "step": 6411, + "task_loss": 0.6767289638519287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7189763784408569, + "epoch": 5.42, + "learning_rate": 2.2899408284023667e-05, + "loss": 0.686, + "step": 6412, + "task_loss": 0.9101861119270325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4065517783164978, + "epoch": 5.42, + "learning_rate": 2.289518174133559e-05, + "loss": 0.3953, + "step": 6413, + "task_loss": 0.7958052754402161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6466049551963806, + "epoch": 5.42, + "learning_rate": 2.2890955198647507e-05, + "loss": 0.5393, + "step": 6414, + "task_loss": 1.3449606895446777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36048945784568787, + "epoch": 5.42, + "learning_rate": 2.2886728655959427e-05, + "loss": 0.4303, + "step": 6415, + "task_loss": 0.3475267291069031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5222697257995605, + "epoch": 5.42, + "learning_rate": 2.2882502113271346e-05, + "loss": 0.5119, + "step": 6416, + "task_loss": 2.133908271789551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.481108158826828, + "epoch": 5.42, + "learning_rate": 2.2878275570583263e-05, + "loss": 0.5032, + "step": 6417, + "task_loss": 0.523410975933075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42595553398132324, + "epoch": 5.42, + "learning_rate": 2.2874049027895182e-05, + "loss": 0.47, + "step": 6418, + "task_loss": 0.4274257719516754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3975520730018616, + "epoch": 5.43, + "learning_rate": 2.2869822485207102e-05, + "loss": 0.4003, + "step": 6419, + "task_loss": 0.5490153431892395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18036630749702454, + "epoch": 5.43, + "learning_rate": 2.286559594251902e-05, + "loss": 0.4487, + "step": 6420, + "task_loss": 0.05080006644129753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4209897518157959, + "epoch": 5.43, + "learning_rate": 2.286136939983094e-05, + "loss": 0.441, + "step": 6421, + "task_loss": 1.0873464345932007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.790572464466095, + "epoch": 5.43, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.5389, + "step": 6422, + "task_loss": 0.38755011558532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48761221766471863, + "epoch": 5.43, + "learning_rate": 2.2852916314454778e-05, + "loss": 0.6122, + "step": 6423, + "task_loss": 0.9050610661506653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4309987723827362, + "epoch": 5.43, + "learning_rate": 2.2848689771766698e-05, + "loss": 0.5794, + "step": 6424, + "task_loss": 0.9355601668357849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5323224067687988, + "epoch": 5.43, + "learning_rate": 2.2844463229078614e-05, + "loss": 0.493, + "step": 6425, + "task_loss": 0.3420991599559784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.332425594329834, + "epoch": 5.43, + "learning_rate": 2.2840236686390534e-05, + "loss": 0.5334, + "step": 6426, + "task_loss": 0.8157430291175842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5683882236480713, + "epoch": 5.43, + "learning_rate": 2.2836010143702453e-05, + "loss": 0.5076, + "step": 6427, + "task_loss": 0.71019047498703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4162535071372986, + "epoch": 5.43, + "learning_rate": 2.2831783601014373e-05, + "loss": 0.4171, + "step": 6428, + "task_loss": 0.23452575504779816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49199119210243225, + "epoch": 5.43, + "learning_rate": 2.282755705832629e-05, + "loss": 0.5298, + "step": 6429, + "task_loss": 0.3094976544380188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3522765040397644, + "epoch": 5.44, + "learning_rate": 2.282333051563821e-05, + "loss": 0.3936, + "step": 6430, + "task_loss": 0.35985374450683594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.336675226688385, + "epoch": 5.44, + "learning_rate": 2.281910397295013e-05, + "loss": 0.5095, + "step": 6431, + "task_loss": 0.06399393081665039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3393029570579529, + "epoch": 5.44, + "learning_rate": 2.2814877430262045e-05, + "loss": 0.3491, + "step": 6432, + "task_loss": 0.16699552536010742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36589372158050537, + "epoch": 5.44, + "learning_rate": 2.2810650887573965e-05, + "loss": 0.414, + "step": 6433, + "task_loss": 0.5629777312278748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23177243769168854, + "epoch": 5.44, + "learning_rate": 2.2806424344885885e-05, + "loss": 0.451, + "step": 6434, + "task_loss": 0.6587018966674805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38670265674591064, + "epoch": 5.44, + "learning_rate": 2.28021978021978e-05, + "loss": 0.389, + "step": 6435, + "task_loss": 0.9098807573318481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31930696964263916, + "epoch": 5.44, + "learning_rate": 2.2797971259509724e-05, + "loss": 0.3419, + "step": 6436, + "task_loss": 0.6778592467308044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3730270266532898, + "epoch": 5.44, + "learning_rate": 2.279374471682164e-05, + "loss": 0.4483, + "step": 6437, + "task_loss": 0.8430075645446777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26599180698394775, + "epoch": 5.44, + "learning_rate": 2.278951817413356e-05, + "loss": 0.4399, + "step": 6438, + "task_loss": 0.05670448765158653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4763574004173279, + "epoch": 5.44, + "learning_rate": 2.278529163144548e-05, + "loss": 0.403, + "step": 6439, + "task_loss": 0.1193678081035614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.634314775466919, + "epoch": 5.44, + "learning_rate": 2.2781065088757396e-05, + "loss": 0.6364, + "step": 6440, + "task_loss": 0.6560878157615662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4641571342945099, + "epoch": 5.44, + "learning_rate": 2.2776838546069316e-05, + "loss": 0.4685, + "step": 6441, + "task_loss": 1.5128400325775146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4815780818462372, + "epoch": 5.45, + "learning_rate": 2.2772612003381236e-05, + "loss": 0.4287, + "step": 6442, + "task_loss": 1.0178754329681396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5637037754058838, + "epoch": 5.45, + "learning_rate": 2.2768385460693152e-05, + "loss": 0.5423, + "step": 6443, + "task_loss": 0.3484659790992737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4281385540962219, + "epoch": 5.45, + "learning_rate": 2.2764158918005075e-05, + "loss": 0.4568, + "step": 6444, + "task_loss": 1.363336443901062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48757416009902954, + "epoch": 5.45, + "learning_rate": 2.2759932375316992e-05, + "loss": 0.6214, + "step": 6445, + "task_loss": 0.47412046790122986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30364447832107544, + "epoch": 5.45, + "learning_rate": 2.2755705832628908e-05, + "loss": 0.5425, + "step": 6446, + "task_loss": 0.5810065865516663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6672943830490112, + "epoch": 5.45, + "learning_rate": 2.275147928994083e-05, + "loss": 0.6902, + "step": 6447, + "task_loss": 0.23105435073375702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2524298429489136, + "epoch": 5.45, + "learning_rate": 2.2747252747252748e-05, + "loss": 0.4326, + "step": 6448, + "task_loss": 0.579328715801239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41499415040016174, + "epoch": 5.45, + "learning_rate": 2.2743026204564667e-05, + "loss": 0.5255, + "step": 6449, + "task_loss": 0.5841841101646423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.4481174945831299, + "epoch": 5.45, + "learning_rate": 2.2738799661876587e-05, + "loss": 0.6774, + "step": 6450, + "task_loss": 1.411600947380066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47486603260040283, + "epoch": 5.45, + "learning_rate": 2.2734573119188503e-05, + "loss": 0.5062, + "step": 6451, + "task_loss": 0.6447728872299194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26596134901046753, + "epoch": 5.45, + "learning_rate": 2.2730346576500423e-05, + "loss": 0.4111, + "step": 6452, + "task_loss": 0.30093953013420105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35910356044769287, + "epoch": 5.45, + "learning_rate": 2.2726120033812343e-05, + "loss": 0.4036, + "step": 6453, + "task_loss": 0.8828761577606201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43394482135772705, + "epoch": 5.46, + "learning_rate": 2.272189349112426e-05, + "loss": 0.5238, + "step": 6454, + "task_loss": 0.6709728240966797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6335655450820923, + "epoch": 5.46, + "learning_rate": 2.2717666948436182e-05, + "loss": 0.5635, + "step": 6455, + "task_loss": 0.4928433895111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4067826271057129, + "epoch": 5.46, + "learning_rate": 2.27134404057481e-05, + "loss": 0.5514, + "step": 6456, + "task_loss": 0.576821506023407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3135641813278198, + "epoch": 5.46, + "learning_rate": 2.270921386306002e-05, + "loss": 0.369, + "step": 6457, + "task_loss": 0.6777243614196777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.745387077331543, + "epoch": 5.46, + "learning_rate": 2.2704987320371938e-05, + "loss": 0.4971, + "step": 6458, + "task_loss": 0.9218758940696716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4622109532356262, + "epoch": 5.46, + "learning_rate": 2.2700760777683854e-05, + "loss": 0.577, + "step": 6459, + "task_loss": 0.5221714973449707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4771652817726135, + "epoch": 5.46, + "learning_rate": 2.2696534234995774e-05, + "loss": 0.5078, + "step": 6460, + "task_loss": 0.6215834617614746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4503202736377716, + "epoch": 5.46, + "learning_rate": 2.2692307692307694e-05, + "loss": 0.4179, + "step": 6461, + "task_loss": 1.5685396194458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5257421135902405, + "epoch": 5.46, + "learning_rate": 2.268808114961961e-05, + "loss": 0.3899, + "step": 6462, + "task_loss": 0.38245683908462524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39943039417266846, + "epoch": 5.46, + "learning_rate": 2.268385460693153e-05, + "loss": 0.5771, + "step": 6463, + "task_loss": 0.45121529698371887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.540168046951294, + "epoch": 5.46, + "learning_rate": 2.267962806424345e-05, + "loss": 0.5128, + "step": 6464, + "task_loss": 1.0453928709030151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30195993185043335, + "epoch": 5.46, + "learning_rate": 2.267540152155537e-05, + "loss": 0.4729, + "step": 6465, + "task_loss": 1.2784377336502075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35184744000434875, + "epoch": 5.47, + "learning_rate": 2.267117497886729e-05, + "loss": 0.4552, + "step": 6466, + "task_loss": 0.32141849398612976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36295440793037415, + "epoch": 5.47, + "learning_rate": 2.2666948436179206e-05, + "loss": 0.4963, + "step": 6467, + "task_loss": 1.4009230136871338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23777607083320618, + "epoch": 5.47, + "learning_rate": 2.2662721893491125e-05, + "loss": 0.4242, + "step": 6468, + "task_loss": 0.2750861644744873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7977490425109863, + "epoch": 5.47, + "learning_rate": 2.2658495350803045e-05, + "loss": 0.6131, + "step": 6469, + "task_loss": 1.0260581970214844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41261765360832214, + "epoch": 5.47, + "learning_rate": 2.265426880811496e-05, + "loss": 0.4611, + "step": 6470, + "task_loss": 0.2921566665172577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6386584043502808, + "epoch": 5.47, + "learning_rate": 2.265004226542688e-05, + "loss": 0.5262, + "step": 6471, + "task_loss": 0.9740623235702515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6597335338592529, + "epoch": 5.47, + "learning_rate": 2.26458157227388e-05, + "loss": 0.5375, + "step": 6472, + "task_loss": 0.4871094226837158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47870510816574097, + "epoch": 5.47, + "learning_rate": 2.264158918005072e-05, + "loss": 0.4089, + "step": 6473, + "task_loss": 1.3056256771087646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41450589895248413, + "epoch": 5.47, + "learning_rate": 2.2637362637362637e-05, + "loss": 0.5575, + "step": 6474, + "task_loss": 1.1317529678344727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2943868935108185, + "epoch": 5.47, + "learning_rate": 2.2633136094674557e-05, + "loss": 0.4193, + "step": 6475, + "task_loss": 0.5493542551994324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.515693724155426, + "epoch": 5.47, + "learning_rate": 2.2628909551986476e-05, + "loss": 0.4847, + "step": 6476, + "task_loss": 0.7433851361274719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5433123111724854, + "epoch": 5.47, + "learning_rate": 2.2624683009298396e-05, + "loss": 0.6244, + "step": 6477, + "task_loss": 0.7271904349327087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5849413275718689, + "epoch": 5.48, + "learning_rate": 2.2620456466610313e-05, + "loss": 0.6052, + "step": 6478, + "task_loss": 0.9450557827949524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3629588186740875, + "epoch": 5.48, + "learning_rate": 2.2616229923922232e-05, + "loss": 0.385, + "step": 6479, + "task_loss": 0.15342813730239868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33997422456741333, + "epoch": 5.48, + "learning_rate": 2.2612003381234152e-05, + "loss": 0.4322, + "step": 6480, + "task_loss": 0.2239326536655426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28108736872673035, + "epoch": 5.48, + "learning_rate": 2.2607776838546072e-05, + "loss": 0.4186, + "step": 6481, + "task_loss": 0.5179224014282227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3189160227775574, + "epoch": 5.48, + "learning_rate": 2.2603550295857988e-05, + "loss": 0.5085, + "step": 6482, + "task_loss": 0.9262568950653076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5939887762069702, + "epoch": 5.48, + "learning_rate": 2.2599323753169908e-05, + "loss": 0.5062, + "step": 6483, + "task_loss": 1.4437553882598877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5564907789230347, + "epoch": 5.48, + "learning_rate": 2.2595097210481828e-05, + "loss": 0.4729, + "step": 6484, + "task_loss": 1.0463241338729858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41006043553352356, + "epoch": 5.48, + "learning_rate": 2.2590870667793744e-05, + "loss": 0.3606, + "step": 6485, + "task_loss": 1.1540050506591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38574209809303284, + "epoch": 5.48, + "learning_rate": 2.2586644125105667e-05, + "loss": 0.6044, + "step": 6486, + "task_loss": 0.33480316400527954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45560991764068604, + "epoch": 5.48, + "learning_rate": 2.2582417582417583e-05, + "loss": 0.4519, + "step": 6487, + "task_loss": 0.7456464767456055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2866612374782562, + "epoch": 5.48, + "learning_rate": 2.2578191039729503e-05, + "loss": 0.5121, + "step": 6488, + "task_loss": 0.02357068657875061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8256540298461914, + "epoch": 5.48, + "learning_rate": 2.2573964497041423e-05, + "loss": 0.5862, + "step": 6489, + "task_loss": 1.5655720233917236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4515344500541687, + "epoch": 5.49, + "learning_rate": 2.256973795435334e-05, + "loss": 0.4042, + "step": 6490, + "task_loss": 0.5995666980743408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3027622699737549, + "epoch": 5.49, + "learning_rate": 2.256551141166526e-05, + "loss": 0.5452, + "step": 6491, + "task_loss": 1.830371379852295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4637233018875122, + "epoch": 5.49, + "learning_rate": 2.256128486897718e-05, + "loss": 0.4726, + "step": 6492, + "task_loss": 0.8471509218215942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6204744577407837, + "epoch": 5.49, + "learning_rate": 2.2557058326289095e-05, + "loss": 0.5464, + "step": 6493, + "task_loss": 0.5459375977516174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25900793075561523, + "epoch": 5.49, + "learning_rate": 2.2552831783601018e-05, + "loss": 0.4367, + "step": 6494, + "task_loss": 0.804066002368927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4149196147918701, + "epoch": 5.49, + "learning_rate": 2.2548605240912935e-05, + "loss": 0.448, + "step": 6495, + "task_loss": 0.5030509829521179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6226173639297485, + "epoch": 5.49, + "learning_rate": 2.254437869822485e-05, + "loss": 0.5646, + "step": 6496, + "task_loss": 1.243924617767334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4392818808555603, + "epoch": 5.49, + "learning_rate": 2.2540152155536774e-05, + "loss": 0.4997, + "step": 6497, + "task_loss": 0.5331965684890747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48184460401535034, + "epoch": 5.49, + "learning_rate": 2.253592561284869e-05, + "loss": 0.4324, + "step": 6498, + "task_loss": 1.0983445644378662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5859066247940063, + "epoch": 5.49, + "learning_rate": 2.2531699070160607e-05, + "loss": 0.5205, + "step": 6499, + "task_loss": 1.3031401634216309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5207016468048096, + "epoch": 5.49, + "learning_rate": 2.252747252747253e-05, + "loss": 0.4405, + "step": 6500, + "task_loss": 0.810382068157196 + }, + { + "epoch": 5.49, + "eval_accuracy": 0.9071287128712872, + "eval_loss": 0.31121769547462463, + "eval_runtime": 230.0383, + "eval_samples_per_second": 109.764, + "eval_steps_per_second": 0.861, + "step": 6500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25616952776908875, + "epoch": 5.5, + "learning_rate": 2.2523245984784446e-05, + "loss": 0.4002, + "step": 6501, + "task_loss": 0.6353225708007812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36745402216911316, + "epoch": 5.5, + "learning_rate": 2.2519019442096366e-05, + "loss": 0.4508, + "step": 6502, + "task_loss": 0.6705199480056763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2786528170108795, + "epoch": 5.5, + "learning_rate": 2.2514792899408286e-05, + "loss": 0.5466, + "step": 6503, + "task_loss": 0.25337180495262146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5456166863441467, + "epoch": 5.5, + "learning_rate": 2.2510566356720202e-05, + "loss": 0.5253, + "step": 6504, + "task_loss": 0.6149640679359436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5985019207000732, + "epoch": 5.5, + "learning_rate": 2.2506339814032125e-05, + "loss": 0.4844, + "step": 6505, + "task_loss": 0.5040889382362366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4197734594345093, + "epoch": 5.5, + "learning_rate": 2.250211327134404e-05, + "loss": 0.4802, + "step": 6506, + "task_loss": 0.9863604307174683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3264505863189697, + "epoch": 5.5, + "learning_rate": 2.2497886728655958e-05, + "loss": 0.4333, + "step": 6507, + "task_loss": 0.5870899558067322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48236167430877686, + "epoch": 5.5, + "learning_rate": 2.249366018596788e-05, + "loss": 0.4413, + "step": 6508, + "task_loss": 0.2647905945777893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5345989465713501, + "epoch": 5.5, + "learning_rate": 2.2489433643279797e-05, + "loss": 0.5486, + "step": 6509, + "task_loss": 0.472306489944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47498559951782227, + "epoch": 5.5, + "learning_rate": 2.2485207100591717e-05, + "loss": 0.42, + "step": 6510, + "task_loss": 0.5003288388252258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3009692430496216, + "epoch": 5.5, + "learning_rate": 2.2480980557903637e-05, + "loss": 0.3826, + "step": 6511, + "task_loss": 0.38763153553009033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4064667522907257, + "epoch": 5.5, + "learning_rate": 2.2476754015215553e-05, + "loss": 0.5453, + "step": 6512, + "task_loss": 0.6678287386894226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5118426084518433, + "epoch": 5.51, + "learning_rate": 2.2472527472527473e-05, + "loss": 0.4578, + "step": 6513, + "task_loss": 0.7778018712997437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4149852991104126, + "epoch": 5.51, + "learning_rate": 2.2468300929839393e-05, + "loss": 0.586, + "step": 6514, + "task_loss": 0.5796111822128296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18042577803134918, + "epoch": 5.51, + "learning_rate": 2.2464074387151312e-05, + "loss": 0.3526, + "step": 6515, + "task_loss": 0.4205179810523987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4085802733898163, + "epoch": 5.51, + "learning_rate": 2.245984784446323e-05, + "loss": 0.3893, + "step": 6516, + "task_loss": 0.22515836358070374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3967309296131134, + "epoch": 5.51, + "learning_rate": 2.245562130177515e-05, + "loss": 0.4296, + "step": 6517, + "task_loss": 1.1171687841415405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8369637727737427, + "epoch": 5.51, + "learning_rate": 2.2451394759087068e-05, + "loss": 0.4437, + "step": 6518, + "task_loss": 0.5157053470611572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4168230891227722, + "epoch": 5.51, + "learning_rate": 2.2447168216398988e-05, + "loss": 0.6323, + "step": 6519, + "task_loss": 0.486141562461853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5836775302886963, + "epoch": 5.51, + "learning_rate": 2.2442941673710904e-05, + "loss": 0.5197, + "step": 6520, + "task_loss": 0.7917031049728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5544719099998474, + "epoch": 5.51, + "learning_rate": 2.2438715131022824e-05, + "loss": 0.4892, + "step": 6521, + "task_loss": 0.7600495219230652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28418242931365967, + "epoch": 5.51, + "learning_rate": 2.2434488588334744e-05, + "loss": 0.3287, + "step": 6522, + "task_loss": 0.5098791122436523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4191673994064331, + "epoch": 5.51, + "learning_rate": 2.2430262045646664e-05, + "loss": 0.4697, + "step": 6523, + "task_loss": 0.8762914538383484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41490814089775085, + "epoch": 5.51, + "learning_rate": 2.242603550295858e-05, + "loss": 0.5809, + "step": 6524, + "task_loss": 0.2511128783226013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4627256989479065, + "epoch": 5.52, + "learning_rate": 2.24218089602705e-05, + "loss": 0.4896, + "step": 6525, + "task_loss": 1.1867831945419312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4519067704677582, + "epoch": 5.52, + "learning_rate": 2.241758241758242e-05, + "loss": 0.466, + "step": 6526, + "task_loss": 0.45990416407585144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2885536849498749, + "epoch": 5.52, + "learning_rate": 2.2413355874894336e-05, + "loss": 0.3829, + "step": 6527, + "task_loss": 0.2812303900718689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6543110013008118, + "epoch": 5.52, + "learning_rate": 2.2409129332206255e-05, + "loss": 0.5056, + "step": 6528, + "task_loss": 0.6204474568367004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4903867840766907, + "epoch": 5.52, + "learning_rate": 2.2404902789518175e-05, + "loss": 0.477, + "step": 6529, + "task_loss": 0.31804174184799194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5285801887512207, + "epoch": 5.52, + "learning_rate": 2.2400676246830095e-05, + "loss": 0.5004, + "step": 6530, + "task_loss": 0.8360893130302429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7006930708885193, + "epoch": 5.52, + "learning_rate": 2.2396449704142015e-05, + "loss": 0.6388, + "step": 6531, + "task_loss": 1.1461968421936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33204856514930725, + "epoch": 5.52, + "learning_rate": 2.239222316145393e-05, + "loss": 0.3987, + "step": 6532, + "task_loss": 0.475440114736557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4505437910556793, + "epoch": 5.52, + "learning_rate": 2.238799661876585e-05, + "loss": 0.4337, + "step": 6533, + "task_loss": 1.1317940950393677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21005533635616302, + "epoch": 5.52, + "learning_rate": 2.238377007607777e-05, + "loss": 0.3488, + "step": 6534, + "task_loss": 0.0205268282443285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5093111395835876, + "epoch": 5.52, + "learning_rate": 2.2379543533389687e-05, + "loss": 0.4081, + "step": 6535, + "task_loss": 0.7224283814430237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5326557159423828, + "epoch": 5.52, + "learning_rate": 2.2375316990701607e-05, + "loss": 0.6215, + "step": 6536, + "task_loss": 0.4654531478881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5011619925498962, + "epoch": 5.53, + "learning_rate": 2.2371090448013526e-05, + "loss": 0.4852, + "step": 6537, + "task_loss": 0.4804128408432007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3525070250034332, + "epoch": 5.53, + "learning_rate": 2.2366863905325443e-05, + "loss": 0.5048, + "step": 6538, + "task_loss": 0.975352942943573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3623230457305908, + "epoch": 5.53, + "learning_rate": 2.2362637362637366e-05, + "loss": 0.433, + "step": 6539, + "task_loss": 0.3931300640106201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6981106996536255, + "epoch": 5.53, + "learning_rate": 2.2358410819949282e-05, + "loss": 0.4037, + "step": 6540, + "task_loss": 1.199479579925537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4749060869216919, + "epoch": 5.53, + "learning_rate": 2.2354184277261202e-05, + "loss": 0.5002, + "step": 6541, + "task_loss": 1.0661007165908813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32553720474243164, + "epoch": 5.53, + "learning_rate": 2.234995773457312e-05, + "loss": 0.3613, + "step": 6542, + "task_loss": 0.11945799738168716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4936124086380005, + "epoch": 5.53, + "learning_rate": 2.2345731191885038e-05, + "loss": 0.4778, + "step": 6543, + "task_loss": 0.5682523846626282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5230497717857361, + "epoch": 5.53, + "learning_rate": 2.2341504649196958e-05, + "loss": 0.4574, + "step": 6544, + "task_loss": 0.9144527912139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28021690249443054, + "epoch": 5.53, + "learning_rate": 2.2337278106508877e-05, + "loss": 0.4136, + "step": 6545, + "task_loss": 0.5678066611289978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5350494980812073, + "epoch": 5.53, + "learning_rate": 2.2333051563820794e-05, + "loss": 0.5417, + "step": 6546, + "task_loss": 1.476010799407959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3645980656147003, + "epoch": 5.53, + "learning_rate": 2.2328825021132717e-05, + "loss": 0.5291, + "step": 6547, + "task_loss": 0.3196815252304077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22159670293331146, + "epoch": 5.53, + "learning_rate": 2.2324598478444633e-05, + "loss": 0.3842, + "step": 6548, + "task_loss": 0.06847655028104782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.542724609375, + "epoch": 5.54, + "learning_rate": 2.232037193575655e-05, + "loss": 0.5366, + "step": 6549, + "task_loss": 0.8789463043212891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5722472667694092, + "epoch": 5.54, + "learning_rate": 2.2316145393068473e-05, + "loss": 0.5189, + "step": 6550, + "task_loss": 1.522711992263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4096110463142395, + "epoch": 5.54, + "learning_rate": 2.231191885038039e-05, + "loss": 0.4056, + "step": 6551, + "task_loss": 1.3393573760986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.209906667470932, + "epoch": 5.54, + "learning_rate": 2.230769230769231e-05, + "loss": 0.4308, + "step": 6552, + "task_loss": 0.4893885850906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3922348618507385, + "epoch": 5.54, + "learning_rate": 2.230346576500423e-05, + "loss": 0.5648, + "step": 6553, + "task_loss": 0.4997672736644745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49192938208580017, + "epoch": 5.54, + "learning_rate": 2.2299239222316145e-05, + "loss": 0.5148, + "step": 6554, + "task_loss": 0.7971968054771423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48135387897491455, + "epoch": 5.54, + "learning_rate": 2.2295012679628065e-05, + "loss": 0.4434, + "step": 6555, + "task_loss": 1.213742971420288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31976959109306335, + "epoch": 5.54, + "learning_rate": 2.2290786136939984e-05, + "loss": 0.4681, + "step": 6556, + "task_loss": 0.4441589415073395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37154245376586914, + "epoch": 5.54, + "learning_rate": 2.22865595942519e-05, + "loss": 0.433, + "step": 6557, + "task_loss": 1.0343756675720215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5619939565658569, + "epoch": 5.54, + "learning_rate": 2.2282333051563824e-05, + "loss": 0.5246, + "step": 6558, + "task_loss": 0.6676798462867737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2783992886543274, + "epoch": 5.54, + "learning_rate": 2.227810650887574e-05, + "loss": 0.3707, + "step": 6559, + "task_loss": 0.610399603843689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.769432783126831, + "epoch": 5.54, + "learning_rate": 2.227387996618766e-05, + "loss": 0.4293, + "step": 6560, + "task_loss": 1.2190186977386475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5816839933395386, + "epoch": 5.55, + "learning_rate": 2.226965342349958e-05, + "loss": 0.5375, + "step": 6561, + "task_loss": 0.6093623042106628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2690340280532837, + "epoch": 5.55, + "learning_rate": 2.2265426880811496e-05, + "loss": 0.3431, + "step": 6562, + "task_loss": 0.23161719739437103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46483105421066284, + "epoch": 5.55, + "learning_rate": 2.2261200338123416e-05, + "loss": 0.4134, + "step": 6563, + "task_loss": 0.7454406023025513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36716657876968384, + "epoch": 5.55, + "learning_rate": 2.2256973795435336e-05, + "loss": 0.534, + "step": 6564, + "task_loss": 0.16079899668693542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.398648202419281, + "epoch": 5.55, + "learning_rate": 2.2252747252747252e-05, + "loss": 0.4391, + "step": 6565, + "task_loss": 1.6294955015182495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44931724667549133, + "epoch": 5.55, + "learning_rate": 2.224852071005917e-05, + "loss": 0.4155, + "step": 6566, + "task_loss": 0.5449956655502319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5374575853347778, + "epoch": 5.55, + "learning_rate": 2.224429416737109e-05, + "loss": 0.3974, + "step": 6567, + "task_loss": 0.9460140466690063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48534244298934937, + "epoch": 5.55, + "learning_rate": 2.224006762468301e-05, + "loss": 0.5355, + "step": 6568, + "task_loss": 0.6801758408546448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49755018949508667, + "epoch": 5.55, + "learning_rate": 2.223584108199493e-05, + "loss": 0.5291, + "step": 6569, + "task_loss": 0.1559273600578308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6028162240982056, + "epoch": 5.55, + "learning_rate": 2.2231614539306847e-05, + "loss": 0.5084, + "step": 6570, + "task_loss": 1.3433688879013062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6831220388412476, + "epoch": 5.55, + "learning_rate": 2.2227387996618767e-05, + "loss": 0.6116, + "step": 6571, + "task_loss": 1.560497522354126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4108806848526001, + "epoch": 5.56, + "learning_rate": 2.2223161453930687e-05, + "loss": 0.4632, + "step": 6572, + "task_loss": 1.8107231855392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6039530634880066, + "epoch": 5.56, + "learning_rate": 2.2218934911242606e-05, + "loss": 0.4891, + "step": 6573, + "task_loss": 0.44830143451690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3714635968208313, + "epoch": 5.56, + "learning_rate": 2.2214708368554523e-05, + "loss": 0.416, + "step": 6574, + "task_loss": 0.8233501315116882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4858923852443695, + "epoch": 5.56, + "learning_rate": 2.2210481825866443e-05, + "loss": 0.4304, + "step": 6575, + "task_loss": 0.8596088886260986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43654894828796387, + "epoch": 5.56, + "learning_rate": 2.2206255283178362e-05, + "loss": 0.5777, + "step": 6576, + "task_loss": 0.2936279773712158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4588269591331482, + "epoch": 5.56, + "learning_rate": 2.220202874049028e-05, + "loss": 0.4125, + "step": 6577, + "task_loss": 0.40271538496017456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4263407588005066, + "epoch": 5.56, + "learning_rate": 2.21978021978022e-05, + "loss": 0.4968, + "step": 6578, + "task_loss": 0.8342435956001282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3663595914840698, + "epoch": 5.56, + "learning_rate": 2.2193575655114118e-05, + "loss": 0.3902, + "step": 6579, + "task_loss": 0.8362998366355896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4083057641983032, + "epoch": 5.56, + "learning_rate": 2.2189349112426034e-05, + "loss": 0.3909, + "step": 6580, + "task_loss": 0.3547692894935608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36761537194252014, + "epoch": 5.56, + "learning_rate": 2.2185122569737958e-05, + "loss": 0.4281, + "step": 6581, + "task_loss": 0.44335564970970154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6638447046279907, + "epoch": 5.56, + "learning_rate": 2.2180896027049874e-05, + "loss": 0.5134, + "step": 6582, + "task_loss": 0.9176337718963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42400234937667847, + "epoch": 5.56, + "learning_rate": 2.2176669484361794e-05, + "loss": 0.4589, + "step": 6583, + "task_loss": 0.7746252417564392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2727625370025635, + "epoch": 5.57, + "learning_rate": 2.2172442941673713e-05, + "loss": 0.3334, + "step": 6584, + "task_loss": 0.03718046471476555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3818608820438385, + "epoch": 5.57, + "learning_rate": 2.216821639898563e-05, + "loss": 0.3817, + "step": 6585, + "task_loss": 1.5395042896270752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3122103810310364, + "epoch": 5.57, + "learning_rate": 2.216398985629755e-05, + "loss": 0.3716, + "step": 6586, + "task_loss": 0.4920465350151062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18556299805641174, + "epoch": 5.57, + "learning_rate": 2.215976331360947e-05, + "loss": 0.3941, + "step": 6587, + "task_loss": 0.07084954530000687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39843857288360596, + "epoch": 5.57, + "learning_rate": 2.2155536770921386e-05, + "loss": 0.393, + "step": 6588, + "task_loss": 0.4581957757472992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33707818388938904, + "epoch": 5.57, + "learning_rate": 2.215131022823331e-05, + "loss": 0.491, + "step": 6589, + "task_loss": 0.5638668537139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5662754774093628, + "epoch": 5.57, + "learning_rate": 2.2147083685545225e-05, + "loss": 0.4797, + "step": 6590, + "task_loss": 0.8410687446594238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45930880308151245, + "epoch": 5.57, + "learning_rate": 2.214285714285714e-05, + "loss": 0.5129, + "step": 6591, + "task_loss": 0.4605422616004944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5784786939620972, + "epoch": 5.57, + "learning_rate": 2.2138630600169065e-05, + "loss": 0.467, + "step": 6592, + "task_loss": 0.5142861008644104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5326275825500488, + "epoch": 5.57, + "learning_rate": 2.213440405748098e-05, + "loss": 0.4969, + "step": 6593, + "task_loss": 0.8516028523445129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3323522210121155, + "epoch": 5.57, + "learning_rate": 2.21301775147929e-05, + "loss": 0.4285, + "step": 6594, + "task_loss": 0.49355900287628174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45413345098495483, + "epoch": 5.57, + "learning_rate": 2.212595097210482e-05, + "loss": 0.5078, + "step": 6595, + "task_loss": 0.13283292949199677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47557783126831055, + "epoch": 5.58, + "learning_rate": 2.2121724429416737e-05, + "loss": 0.5227, + "step": 6596, + "task_loss": 0.8164372444152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21432280540466309, + "epoch": 5.58, + "learning_rate": 2.2117497886728656e-05, + "loss": 0.4463, + "step": 6597, + "task_loss": 0.3244723379611969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3982103765010834, + "epoch": 5.58, + "learning_rate": 2.2113271344040576e-05, + "loss": 0.488, + "step": 6598, + "task_loss": 0.7477995157241821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3384738266468048, + "epoch": 5.58, + "learning_rate": 2.2109044801352493e-05, + "loss": 0.3274, + "step": 6599, + "task_loss": 0.23857639729976654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6250789165496826, + "epoch": 5.58, + "learning_rate": 2.2104818258664416e-05, + "loss": 0.423, + "step": 6600, + "task_loss": 0.3815581500530243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7946285009384155, + "epoch": 5.58, + "learning_rate": 2.2100591715976332e-05, + "loss": 0.6503, + "step": 6601, + "task_loss": 1.7416517734527588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3971802294254303, + "epoch": 5.58, + "learning_rate": 2.2096365173288252e-05, + "loss": 0.4016, + "step": 6602, + "task_loss": 0.41994473338127136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6118603944778442, + "epoch": 5.58, + "learning_rate": 2.209213863060017e-05, + "loss": 0.5329, + "step": 6603, + "task_loss": 1.5953171253204346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2875477075576782, + "epoch": 5.58, + "learning_rate": 2.2087912087912088e-05, + "loss": 0.3788, + "step": 6604, + "task_loss": 0.3744802176952362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4883019030094147, + "epoch": 5.58, + "learning_rate": 2.2083685545224008e-05, + "loss": 0.5034, + "step": 6605, + "task_loss": 1.2311573028564453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2491014003753662, + "epoch": 5.58, + "learning_rate": 2.2079459002535927e-05, + "loss": 0.4192, + "step": 6606, + "task_loss": 0.23697152733802795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.314663827419281, + "epoch": 5.58, + "learning_rate": 2.2075232459847844e-05, + "loss": 0.3892, + "step": 6607, + "task_loss": 0.34976285696029663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2968905568122864, + "epoch": 5.59, + "learning_rate": 2.2071005917159763e-05, + "loss": 0.4612, + "step": 6608, + "task_loss": 0.2096678763628006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4578056335449219, + "epoch": 5.59, + "learning_rate": 2.2066779374471683e-05, + "loss": 0.5863, + "step": 6609, + "task_loss": 1.4398854970932007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8692323565483093, + "epoch": 5.59, + "learning_rate": 2.2062552831783603e-05, + "loss": 0.599, + "step": 6610, + "task_loss": 1.1830549240112305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2681110203266144, + "epoch": 5.59, + "learning_rate": 2.2058326289095523e-05, + "loss": 0.4221, + "step": 6611, + "task_loss": 1.8927868604660034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5832124352455139, + "epoch": 5.59, + "learning_rate": 2.205409974640744e-05, + "loss": 0.4791, + "step": 6612, + "task_loss": 0.3963264524936676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20971204340457916, + "epoch": 5.59, + "learning_rate": 2.204987320371936e-05, + "loss": 0.3889, + "step": 6613, + "task_loss": 0.5461903214454651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36383718252182007, + "epoch": 5.59, + "learning_rate": 2.204564666103128e-05, + "loss": 0.4383, + "step": 6614, + "task_loss": 0.39409035444259644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.464749813079834, + "epoch": 5.59, + "learning_rate": 2.2041420118343195e-05, + "loss": 0.4723, + "step": 6615, + "task_loss": 1.0040212869644165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4018401503562927, + "epoch": 5.59, + "learning_rate": 2.2037193575655115e-05, + "loss": 0.3811, + "step": 6616, + "task_loss": 0.9969697594642639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2552361488342285, + "epoch": 5.59, + "learning_rate": 2.2032967032967034e-05, + "loss": 0.3915, + "step": 6617, + "task_loss": 0.28578609228134155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2928164601325989, + "epoch": 5.59, + "learning_rate": 2.2028740490278954e-05, + "loss": 0.3754, + "step": 6618, + "task_loss": 0.14997069537639618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.363754540681839, + "epoch": 5.59, + "learning_rate": 2.202451394759087e-05, + "loss": 0.3904, + "step": 6619, + "task_loss": 1.4418221712112427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28526636958122253, + "epoch": 5.6, + "learning_rate": 2.202028740490279e-05, + "loss": 0.3534, + "step": 6620, + "task_loss": 0.6053158640861511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41791990399360657, + "epoch": 5.6, + "learning_rate": 2.201606086221471e-05, + "loss": 0.493, + "step": 6621, + "task_loss": 0.4252471923828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5734915733337402, + "epoch": 5.6, + "learning_rate": 2.201183431952663e-05, + "loss": 0.5535, + "step": 6622, + "task_loss": 1.1274453401565552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7606749534606934, + "epoch": 5.6, + "learning_rate": 2.2007607776838546e-05, + "loss": 0.5223, + "step": 6623, + "task_loss": 1.050026297569275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7250608205795288, + "epoch": 5.6, + "learning_rate": 2.2003381234150466e-05, + "loss": 0.6297, + "step": 6624, + "task_loss": 1.1712957620620728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19725146889686584, + "epoch": 5.6, + "learning_rate": 2.1999154691462385e-05, + "loss": 0.5004, + "step": 6625, + "task_loss": 0.015571881085634232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3152746260166168, + "epoch": 5.6, + "learning_rate": 2.1994928148774305e-05, + "loss": 0.418, + "step": 6626, + "task_loss": 0.4012804627418518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5508754849433899, + "epoch": 5.6, + "learning_rate": 2.199070160608622e-05, + "loss": 0.4614, + "step": 6627, + "task_loss": 1.1642802953720093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23977942764759064, + "epoch": 5.6, + "learning_rate": 2.198647506339814e-05, + "loss": 0.4739, + "step": 6628, + "task_loss": 0.2495940923690796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9456596970558167, + "epoch": 5.6, + "learning_rate": 2.198224852071006e-05, + "loss": 0.5896, + "step": 6629, + "task_loss": 0.8423975110054016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2918052077293396, + "epoch": 5.6, + "learning_rate": 2.1978021978021977e-05, + "loss": 0.3528, + "step": 6630, + "task_loss": 0.3776776194572449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28707656264305115, + "epoch": 5.6, + "learning_rate": 2.19737954353339e-05, + "loss": 0.4453, + "step": 6631, + "task_loss": 1.297120213508606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5698080658912659, + "epoch": 5.61, + "learning_rate": 2.1969568892645817e-05, + "loss": 0.4705, + "step": 6632, + "task_loss": 0.23067864775657654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22129428386688232, + "epoch": 5.61, + "learning_rate": 2.1965342349957737e-05, + "loss": 0.6486, + "step": 6633, + "task_loss": 0.6887525320053101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3690264821052551, + "epoch": 5.61, + "learning_rate": 2.1961115807269656e-05, + "loss": 0.5817, + "step": 6634, + "task_loss": 0.44805046916007996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3376394212245941, + "epoch": 5.61, + "learning_rate": 2.1956889264581573e-05, + "loss": 0.4408, + "step": 6635, + "task_loss": 0.1119861900806427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7915014028549194, + "epoch": 5.61, + "learning_rate": 2.1952662721893492e-05, + "loss": 0.7133, + "step": 6636, + "task_loss": 0.950564444065094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3085026443004608, + "epoch": 5.61, + "learning_rate": 2.1948436179205412e-05, + "loss": 0.3289, + "step": 6637, + "task_loss": 0.2138867974281311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3445605933666229, + "epoch": 5.61, + "learning_rate": 2.194420963651733e-05, + "loss": 0.461, + "step": 6638, + "task_loss": 0.5891308188438416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29952695965766907, + "epoch": 5.61, + "learning_rate": 2.193998309382925e-05, + "loss": 0.4931, + "step": 6639, + "task_loss": 0.08761756122112274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3112938404083252, + "epoch": 5.61, + "learning_rate": 2.1935756551141168e-05, + "loss": 0.4814, + "step": 6640, + "task_loss": 0.38097134232521057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43494877219200134, + "epoch": 5.61, + "learning_rate": 2.1931530008453084e-05, + "loss": 0.5909, + "step": 6641, + "task_loss": 0.4314068555831909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21665450930595398, + "epoch": 5.61, + "learning_rate": 2.1927303465765007e-05, + "loss": 0.5129, + "step": 6642, + "task_loss": 0.19272123277187347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6129412055015564, + "epoch": 5.61, + "learning_rate": 2.1923076923076924e-05, + "loss": 0.524, + "step": 6643, + "task_loss": 0.5507757067680359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4304733872413635, + "epoch": 5.62, + "learning_rate": 2.191885038038884e-05, + "loss": 0.4757, + "step": 6644, + "task_loss": 0.08917589485645294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7696310877799988, + "epoch": 5.62, + "learning_rate": 2.1914623837700763e-05, + "loss": 0.6087, + "step": 6645, + "task_loss": 0.7796250581741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2461678385734558, + "epoch": 5.62, + "learning_rate": 2.191039729501268e-05, + "loss": 0.3226, + "step": 6646, + "task_loss": 0.26112043857574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45307403802871704, + "epoch": 5.62, + "learning_rate": 2.19061707523246e-05, + "loss": 0.4588, + "step": 6647, + "task_loss": 1.579825758934021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3606126606464386, + "epoch": 5.62, + "learning_rate": 2.190194420963652e-05, + "loss": 0.3957, + "step": 6648, + "task_loss": 0.1721055656671524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5496014952659607, + "epoch": 5.62, + "learning_rate": 2.1897717666948435e-05, + "loss": 0.3768, + "step": 6649, + "task_loss": 0.41457489132881165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29964929819107056, + "epoch": 5.62, + "learning_rate": 2.189349112426036e-05, + "loss": 0.4423, + "step": 6650, + "task_loss": 0.5744560360908508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5012792348861694, + "epoch": 5.62, + "learning_rate": 2.1889264581572275e-05, + "loss": 0.4371, + "step": 6651, + "task_loss": 0.7494077682495117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33123207092285156, + "epoch": 5.62, + "learning_rate": 2.188503803888419e-05, + "loss": 0.37, + "step": 6652, + "task_loss": 0.6639485359191895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3170870244503021, + "epoch": 5.62, + "learning_rate": 2.1880811496196114e-05, + "loss": 0.5049, + "step": 6653, + "task_loss": 0.918781042098999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28564170002937317, + "epoch": 5.62, + "learning_rate": 2.187658495350803e-05, + "loss": 0.3822, + "step": 6654, + "task_loss": 0.5689250826835632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5809497833251953, + "epoch": 5.63, + "learning_rate": 2.187235841081995e-05, + "loss": 0.5303, + "step": 6655, + "task_loss": 1.0988904237747192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49017542600631714, + "epoch": 5.63, + "learning_rate": 2.186813186813187e-05, + "loss": 0.4152, + "step": 6656, + "task_loss": 1.0502938032150269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43486344814300537, + "epoch": 5.63, + "learning_rate": 2.1863905325443787e-05, + "loss": 0.6119, + "step": 6657, + "task_loss": 0.5644630193710327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44015204906463623, + "epoch": 5.63, + "learning_rate": 2.1859678782755706e-05, + "loss": 0.4648, + "step": 6658, + "task_loss": 0.3661627769470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2525050640106201, + "epoch": 5.63, + "learning_rate": 2.1855452240067626e-05, + "loss": 0.384, + "step": 6659, + "task_loss": 0.28871649503707886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46284550428390503, + "epoch": 5.63, + "learning_rate": 2.1851225697379546e-05, + "loss": 0.4914, + "step": 6660, + "task_loss": 1.477583646774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3007497191429138, + "epoch": 5.63, + "learning_rate": 2.1846999154691462e-05, + "loss": 0.3943, + "step": 6661, + "task_loss": 0.6044033765792847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.547548770904541, + "epoch": 5.63, + "learning_rate": 2.1842772612003382e-05, + "loss": 0.5052, + "step": 6662, + "task_loss": 1.746382713317871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43696457147598267, + "epoch": 5.63, + "learning_rate": 2.18385460693153e-05, + "loss": 0.359, + "step": 6663, + "task_loss": 0.7883840799331665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3060758709907532, + "epoch": 5.63, + "learning_rate": 2.183431952662722e-05, + "loss": 0.3919, + "step": 6664, + "task_loss": 1.4962902069091797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.620410680770874, + "epoch": 5.63, + "learning_rate": 2.1830092983939138e-05, + "loss": 0.6356, + "step": 6665, + "task_loss": 0.6708673238754272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22259894013404846, + "epoch": 5.63, + "learning_rate": 2.1825866441251057e-05, + "loss": 0.3349, + "step": 6666, + "task_loss": 0.40206316113471985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.514568567276001, + "epoch": 5.64, + "learning_rate": 2.1821639898562977e-05, + "loss": 0.4667, + "step": 6667, + "task_loss": 0.8175463676452637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.385628879070282, + "epoch": 5.64, + "learning_rate": 2.1817413355874897e-05, + "loss": 0.3695, + "step": 6668, + "task_loss": 0.24502016603946686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26250022649765015, + "epoch": 5.64, + "learning_rate": 2.1813186813186813e-05, + "loss": 0.3639, + "step": 6669, + "task_loss": 0.37060174345970154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44420403242111206, + "epoch": 5.64, + "learning_rate": 2.1808960270498733e-05, + "loss": 0.5238, + "step": 6670, + "task_loss": 1.528491735458374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3467496335506439, + "epoch": 5.64, + "learning_rate": 2.1804733727810653e-05, + "loss": 0.5599, + "step": 6671, + "task_loss": 0.5854769945144653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5276918411254883, + "epoch": 5.64, + "learning_rate": 2.180050718512257e-05, + "loss": 0.4836, + "step": 6672, + "task_loss": 0.46222230792045593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7202828526496887, + "epoch": 5.64, + "learning_rate": 2.179628064243449e-05, + "loss": 0.5274, + "step": 6673, + "task_loss": 1.1488972902297974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4792698621749878, + "epoch": 5.64, + "learning_rate": 2.179205409974641e-05, + "loss": 0.5381, + "step": 6674, + "task_loss": 1.0096802711486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4209651052951813, + "epoch": 5.64, + "learning_rate": 2.1787827557058328e-05, + "loss": 0.389, + "step": 6675, + "task_loss": 0.3032013773918152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5430549383163452, + "epoch": 5.64, + "learning_rate": 2.1783601014370248e-05, + "loss": 0.4336, + "step": 6676, + "task_loss": 0.7982643246650696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4977704882621765, + "epoch": 5.64, + "learning_rate": 2.1779374471682164e-05, + "loss": 0.5614, + "step": 6677, + "task_loss": 1.145737648010254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3327661454677582, + "epoch": 5.64, + "learning_rate": 2.1775147928994084e-05, + "loss": 0.4002, + "step": 6678, + "task_loss": 0.2778187692165375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24384522438049316, + "epoch": 5.65, + "learning_rate": 2.1770921386306004e-05, + "loss": 0.3485, + "step": 6679, + "task_loss": 0.1743202954530716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35648736357688904, + "epoch": 5.65, + "learning_rate": 2.176669484361792e-05, + "loss": 0.4728, + "step": 6680, + "task_loss": 0.9142789244651794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44687050580978394, + "epoch": 5.65, + "learning_rate": 2.176246830092984e-05, + "loss": 0.3833, + "step": 6681, + "task_loss": 0.18046848475933075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3163347840309143, + "epoch": 5.65, + "learning_rate": 2.175824175824176e-05, + "loss": 0.472, + "step": 6682, + "task_loss": 0.6492880582809448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5634894967079163, + "epoch": 5.65, + "learning_rate": 2.1754015215553676e-05, + "loss": 0.5677, + "step": 6683, + "task_loss": 0.5417191386222839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3187306821346283, + "epoch": 5.65, + "learning_rate": 2.17497886728656e-05, + "loss": 0.4391, + "step": 6684, + "task_loss": 0.3030279576778412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5657342672348022, + "epoch": 5.65, + "learning_rate": 2.1745562130177516e-05, + "loss": 0.447, + "step": 6685, + "task_loss": 1.1252855062484741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.525389552116394, + "epoch": 5.65, + "learning_rate": 2.1741335587489435e-05, + "loss": 0.5207, + "step": 6686, + "task_loss": 0.9672444462776184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30847275257110596, + "epoch": 5.65, + "learning_rate": 2.1737109044801355e-05, + "loss": 0.3749, + "step": 6687, + "task_loss": 0.31510409712791443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5977796316146851, + "epoch": 5.65, + "learning_rate": 2.173288250211327e-05, + "loss": 0.4986, + "step": 6688, + "task_loss": 0.9500839114189148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3775804042816162, + "epoch": 5.65, + "learning_rate": 2.172865595942519e-05, + "loss": 0.4134, + "step": 6689, + "task_loss": 0.7767316699028015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2688702940940857, + "epoch": 5.65, + "learning_rate": 2.172442941673711e-05, + "loss": 0.3116, + "step": 6690, + "task_loss": 0.45763349533081055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4715948700904846, + "epoch": 5.66, + "learning_rate": 2.1720202874049027e-05, + "loss": 0.6495, + "step": 6691, + "task_loss": 1.0293368101119995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5192683935165405, + "epoch": 5.66, + "learning_rate": 2.171597633136095e-05, + "loss": 0.4939, + "step": 6692, + "task_loss": 1.4563164710998535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3596770465373993, + "epoch": 5.66, + "learning_rate": 2.1711749788672867e-05, + "loss": 0.4439, + "step": 6693, + "task_loss": 0.3766735792160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3958856463432312, + "epoch": 5.66, + "learning_rate": 2.1707523245984783e-05, + "loss": 0.4512, + "step": 6694, + "task_loss": 0.9399265646934509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46372485160827637, + "epoch": 5.66, + "learning_rate": 2.1703296703296706e-05, + "loss": 0.6369, + "step": 6695, + "task_loss": 1.3706023693084717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5102686285972595, + "epoch": 5.66, + "learning_rate": 2.1699070160608622e-05, + "loss": 0.3982, + "step": 6696, + "task_loss": 0.4204097092151642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5004092454910278, + "epoch": 5.66, + "learning_rate": 2.1694843617920542e-05, + "loss": 0.496, + "step": 6697, + "task_loss": 0.7308244109153748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31620413064956665, + "epoch": 5.66, + "learning_rate": 2.1690617075232462e-05, + "loss": 0.4924, + "step": 6698, + "task_loss": 0.5250855684280396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29264551401138306, + "epoch": 5.66, + "learning_rate": 2.1686390532544378e-05, + "loss": 0.3504, + "step": 6699, + "task_loss": 0.5557460784912109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4742107391357422, + "epoch": 5.66, + "learning_rate": 2.1682163989856298e-05, + "loss": 0.5024, + "step": 6700, + "task_loss": 1.3855253458023071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5794758200645447, + "epoch": 5.66, + "learning_rate": 2.1677937447168218e-05, + "loss": 0.4461, + "step": 6701, + "task_loss": 0.6162325143814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5781143307685852, + "epoch": 5.66, + "learning_rate": 2.1673710904480134e-05, + "loss": 0.4309, + "step": 6702, + "task_loss": 0.8657614588737488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2678454518318176, + "epoch": 5.67, + "learning_rate": 2.1669484361792057e-05, + "loss": 0.4695, + "step": 6703, + "task_loss": 0.5560580492019653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46910807490348816, + "epoch": 5.67, + "learning_rate": 2.1665257819103974e-05, + "loss": 0.4655, + "step": 6704, + "task_loss": 1.2586930990219116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5803413987159729, + "epoch": 5.67, + "learning_rate": 2.1661031276415893e-05, + "loss": 0.5281, + "step": 6705, + "task_loss": 1.010599970817566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7397748827934265, + "epoch": 5.67, + "learning_rate": 2.1656804733727813e-05, + "loss": 0.481, + "step": 6706, + "task_loss": 0.1796511709690094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7339752316474915, + "epoch": 5.67, + "learning_rate": 2.165257819103973e-05, + "loss": 0.5426, + "step": 6707, + "task_loss": 1.0069605112075806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5188617706298828, + "epoch": 5.67, + "learning_rate": 2.164835164835165e-05, + "loss": 0.5592, + "step": 6708, + "task_loss": 1.3485506772994995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3367837071418762, + "epoch": 5.67, + "learning_rate": 2.164412510566357e-05, + "loss": 0.3851, + "step": 6709, + "task_loss": 0.1405501812696457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40066462755203247, + "epoch": 5.67, + "learning_rate": 2.1639898562975485e-05, + "loss": 0.5281, + "step": 6710, + "task_loss": 0.7205643653869629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6344140768051147, + "epoch": 5.67, + "learning_rate": 2.1635672020287405e-05, + "loss": 0.4494, + "step": 6711, + "task_loss": 0.378339558839798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.298669695854187, + "epoch": 5.67, + "learning_rate": 2.1631445477599325e-05, + "loss": 0.4326, + "step": 6712, + "task_loss": 0.6739474534988403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22347085177898407, + "epoch": 5.67, + "learning_rate": 2.1627218934911244e-05, + "loss": 0.486, + "step": 6713, + "task_loss": 0.29230520129203796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31795555353164673, + "epoch": 5.67, + "learning_rate": 2.1622992392223164e-05, + "loss": 0.3697, + "step": 6714, + "task_loss": 0.36035627126693726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4322434663772583, + "epoch": 5.68, + "learning_rate": 2.161876584953508e-05, + "loss": 0.4574, + "step": 6715, + "task_loss": 1.0636188983917236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41897836327552795, + "epoch": 5.68, + "learning_rate": 2.1614539306847e-05, + "loss": 0.3685, + "step": 6716, + "task_loss": 0.5359217524528503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.256425678730011, + "epoch": 5.68, + "learning_rate": 2.161031276415892e-05, + "loss": 0.4938, + "step": 6717, + "task_loss": 0.9868853092193604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4227105975151062, + "epoch": 5.68, + "learning_rate": 2.160608622147084e-05, + "loss": 0.4819, + "step": 6718, + "task_loss": 1.48383367061615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2603036165237427, + "epoch": 5.68, + "learning_rate": 2.1601859678782756e-05, + "loss": 0.471, + "step": 6719, + "task_loss": 0.04105174541473389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6495502591133118, + "epoch": 5.68, + "learning_rate": 2.1597633136094676e-05, + "loss": 0.5038, + "step": 6720, + "task_loss": 1.2899093627929688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6795089840888977, + "epoch": 5.68, + "learning_rate": 2.1593406593406596e-05, + "loss": 0.4908, + "step": 6721, + "task_loss": 0.6083970665931702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2990046739578247, + "epoch": 5.68, + "learning_rate": 2.1589180050718512e-05, + "loss": 0.5593, + "step": 6722, + "task_loss": 0.30463266372680664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7162476778030396, + "epoch": 5.68, + "learning_rate": 2.1584953508030432e-05, + "loss": 0.634, + "step": 6723, + "task_loss": 1.206944465637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45765915513038635, + "epoch": 5.68, + "learning_rate": 2.158072696534235e-05, + "loss": 0.5834, + "step": 6724, + "task_loss": 0.6975574493408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4896770417690277, + "epoch": 5.68, + "learning_rate": 2.1576500422654268e-05, + "loss": 0.5133, + "step": 6725, + "task_loss": 0.5666850209236145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.51052325963974, + "epoch": 5.69, + "learning_rate": 2.157227387996619e-05, + "loss": 0.534, + "step": 6726, + "task_loss": 0.6694901585578918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7746536731719971, + "epoch": 5.69, + "learning_rate": 2.1568047337278107e-05, + "loss": 0.5707, + "step": 6727, + "task_loss": 0.9090960025787354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5497757196426392, + "epoch": 5.69, + "learning_rate": 2.1563820794590027e-05, + "loss": 0.5242, + "step": 6728, + "task_loss": 0.44954121112823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4458220899105072, + "epoch": 5.69, + "learning_rate": 2.1559594251901947e-05, + "loss": 0.5551, + "step": 6729, + "task_loss": 0.3808966279029846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5431913733482361, + "epoch": 5.69, + "learning_rate": 2.1555367709213863e-05, + "loss": 0.3506, + "step": 6730, + "task_loss": 0.5021058320999146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4136313199996948, + "epoch": 5.69, + "learning_rate": 2.1551141166525783e-05, + "loss": 0.5544, + "step": 6731, + "task_loss": 1.0348055362701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3025673031806946, + "epoch": 5.69, + "learning_rate": 2.1546914623837703e-05, + "loss": 0.4732, + "step": 6732, + "task_loss": 0.940022349357605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3571837544441223, + "epoch": 5.69, + "learning_rate": 2.154268808114962e-05, + "loss": 0.4813, + "step": 6733, + "task_loss": 1.0127058029174805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.488209068775177, + "epoch": 5.69, + "learning_rate": 2.1538461538461542e-05, + "loss": 0.4574, + "step": 6734, + "task_loss": 0.7513001561164856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5237274169921875, + "epoch": 5.69, + "learning_rate": 2.153423499577346e-05, + "loss": 0.4568, + "step": 6735, + "task_loss": 0.37515881657600403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3149186074733734, + "epoch": 5.69, + "learning_rate": 2.1530008453085375e-05, + "loss": 0.6078, + "step": 6736, + "task_loss": 0.5907328128814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4140559434890747, + "epoch": 5.69, + "learning_rate": 2.1525781910397298e-05, + "loss": 0.4045, + "step": 6737, + "task_loss": 1.326681137084961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30922675132751465, + "epoch": 5.7, + "learning_rate": 2.1521555367709214e-05, + "loss": 0.5436, + "step": 6738, + "task_loss": 1.0444397926330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38426321744918823, + "epoch": 5.7, + "learning_rate": 2.1517328825021134e-05, + "loss": 0.3692, + "step": 6739, + "task_loss": 0.2168019860982895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48237109184265137, + "epoch": 5.7, + "learning_rate": 2.1513102282333054e-05, + "loss": 0.4087, + "step": 6740, + "task_loss": 0.8307868242263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5990522503852844, + "epoch": 5.7, + "learning_rate": 2.150887573964497e-05, + "loss": 0.4345, + "step": 6741, + "task_loss": 0.8067715167999268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36341625452041626, + "epoch": 5.7, + "learning_rate": 2.150464919695689e-05, + "loss": 0.5185, + "step": 6742, + "task_loss": 0.6057892441749573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3511134386062622, + "epoch": 5.7, + "learning_rate": 2.150042265426881e-05, + "loss": 0.4416, + "step": 6743, + "task_loss": 0.8138015270233154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3959570527076721, + "epoch": 5.7, + "learning_rate": 2.1496196111580726e-05, + "loss": 0.3349, + "step": 6744, + "task_loss": 0.4190218448638916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4763460159301758, + "epoch": 5.7, + "learning_rate": 2.149196956889265e-05, + "loss": 0.4539, + "step": 6745, + "task_loss": 0.4817401170730591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28733617067337036, + "epoch": 5.7, + "learning_rate": 2.1487743026204565e-05, + "loss": 0.3561, + "step": 6746, + "task_loss": 0.46150100231170654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43958157300949097, + "epoch": 5.7, + "learning_rate": 2.1483516483516482e-05, + "loss": 0.4914, + "step": 6747, + "task_loss": 0.5022943019866943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4857657551765442, + "epoch": 5.7, + "learning_rate": 2.1479289940828405e-05, + "loss": 0.5228, + "step": 6748, + "task_loss": 0.24779477715492249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5735419988632202, + "epoch": 5.7, + "learning_rate": 2.147506339814032e-05, + "loss": 0.4484, + "step": 6749, + "task_loss": 0.8466213345527649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4814309775829315, + "epoch": 5.71, + "learning_rate": 2.147083685545224e-05, + "loss": 0.3724, + "step": 6750, + "task_loss": 0.41916024684906006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3465757966041565, + "epoch": 5.71, + "learning_rate": 2.146661031276416e-05, + "loss": 0.4241, + "step": 6751, + "task_loss": 0.44277143478393555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3248085379600525, + "epoch": 5.71, + "learning_rate": 2.1462383770076077e-05, + "loss": 0.3383, + "step": 6752, + "task_loss": 1.2061140537261963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46878859400749207, + "epoch": 5.71, + "learning_rate": 2.1458157227387997e-05, + "loss": 0.4509, + "step": 6753, + "task_loss": 0.6678359508514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4628848135471344, + "epoch": 5.71, + "learning_rate": 2.1453930684699916e-05, + "loss": 0.4079, + "step": 6754, + "task_loss": 1.001604676246643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3515365421772003, + "epoch": 5.71, + "learning_rate": 2.1449704142011836e-05, + "loss": 0.4458, + "step": 6755, + "task_loss": 0.6357600092887878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5633479952812195, + "epoch": 5.71, + "learning_rate": 2.1445477599323756e-05, + "loss": 0.5436, + "step": 6756, + "task_loss": 0.24646206200122833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7011735439300537, + "epoch": 5.71, + "learning_rate": 2.1441251056635672e-05, + "loss": 0.5626, + "step": 6757, + "task_loss": 1.4723469018936157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4954965114593506, + "epoch": 5.71, + "learning_rate": 2.1437024513947592e-05, + "loss": 0.4289, + "step": 6758, + "task_loss": 0.8803597688674927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41444939374923706, + "epoch": 5.71, + "learning_rate": 2.1432797971259512e-05, + "loss": 0.5857, + "step": 6759, + "task_loss": 0.8552736639976501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48859903216362, + "epoch": 5.71, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.5045, + "step": 6760, + "task_loss": 0.5711913108825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6026777029037476, + "epoch": 5.71, + "learning_rate": 2.1424344885883348e-05, + "loss": 0.5605, + "step": 6761, + "task_loss": 0.3474797308444977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3489863872528076, + "epoch": 5.72, + "learning_rate": 2.1420118343195268e-05, + "loss": 0.3644, + "step": 6762, + "task_loss": 0.7322254180908203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5694568753242493, + "epoch": 5.72, + "learning_rate": 2.1415891800507187e-05, + "loss": 0.4569, + "step": 6763, + "task_loss": 1.4226588010787964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3010168969631195, + "epoch": 5.72, + "learning_rate": 2.1411665257819104e-05, + "loss": 0.4259, + "step": 6764, + "task_loss": 0.6803930401802063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2720373272895813, + "epoch": 5.72, + "learning_rate": 2.1407438715131023e-05, + "loss": 0.4787, + "step": 6765, + "task_loss": 0.19917823374271393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6226383447647095, + "epoch": 5.72, + "learning_rate": 2.1403212172442943e-05, + "loss": 0.4914, + "step": 6766, + "task_loss": 1.3299155235290527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3875640034675598, + "epoch": 5.72, + "learning_rate": 2.1398985629754863e-05, + "loss": 0.4624, + "step": 6767, + "task_loss": 0.22936205565929413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25801533460617065, + "epoch": 5.72, + "learning_rate": 2.139475908706678e-05, + "loss": 0.4108, + "step": 6768, + "task_loss": 1.0187067985534668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5304945111274719, + "epoch": 5.72, + "learning_rate": 2.13905325443787e-05, + "loss": 0.4396, + "step": 6769, + "task_loss": 0.7897787690162659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3565604090690613, + "epoch": 5.72, + "learning_rate": 2.138630600169062e-05, + "loss": 0.415, + "step": 6770, + "task_loss": 0.25789934396743774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32732632756233215, + "epoch": 5.72, + "learning_rate": 2.138207945900254e-05, + "loss": 0.4699, + "step": 6771, + "task_loss": 0.6669847369194031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6157450079917908, + "epoch": 5.72, + "learning_rate": 2.1377852916314455e-05, + "loss": 0.4144, + "step": 6772, + "task_loss": 0.7623788714408875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36396968364715576, + "epoch": 5.72, + "learning_rate": 2.1373626373626375e-05, + "loss": 0.4009, + "step": 6773, + "task_loss": 0.3830263912677765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48826244473457336, + "epoch": 5.73, + "learning_rate": 2.1369399830938294e-05, + "loss": 0.5586, + "step": 6774, + "task_loss": 0.3352643847465515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3475797176361084, + "epoch": 5.73, + "learning_rate": 2.136517328825021e-05, + "loss": 0.4314, + "step": 6775, + "task_loss": 0.15374507009983063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5560595989227295, + "epoch": 5.73, + "learning_rate": 2.136094674556213e-05, + "loss": 0.4307, + "step": 6776, + "task_loss": 0.48674294352531433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34720557928085327, + "epoch": 5.73, + "learning_rate": 2.135672020287405e-05, + "loss": 0.4748, + "step": 6777, + "task_loss": 0.4600304067134857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2592812180519104, + "epoch": 5.73, + "learning_rate": 2.135249366018597e-05, + "loss": 0.3788, + "step": 6778, + "task_loss": 0.42032235860824585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4969314634799957, + "epoch": 5.73, + "learning_rate": 2.134826711749789e-05, + "loss": 0.4663, + "step": 6779, + "task_loss": 0.9224324226379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39084574580192566, + "epoch": 5.73, + "learning_rate": 2.1344040574809806e-05, + "loss": 0.5807, + "step": 6780, + "task_loss": 0.17408879101276398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6009945869445801, + "epoch": 5.73, + "learning_rate": 2.1339814032121726e-05, + "loss": 0.5261, + "step": 6781, + "task_loss": 1.31087064743042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19805112481117249, + "epoch": 5.73, + "learning_rate": 2.1335587489433645e-05, + "loss": 0.4153, + "step": 6782, + "task_loss": 0.5363072156906128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19592398405075073, + "epoch": 5.73, + "learning_rate": 2.1331360946745562e-05, + "loss": 0.4602, + "step": 6783, + "task_loss": 0.7822631001472473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5393253564834595, + "epoch": 5.73, + "learning_rate": 2.1327134404057485e-05, + "loss": 0.4271, + "step": 6784, + "task_loss": 0.3242488503456116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33584100008010864, + "epoch": 5.73, + "learning_rate": 2.13229078613694e-05, + "loss": 0.3779, + "step": 6785, + "task_loss": 0.40616145730018616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4789864718914032, + "epoch": 5.74, + "learning_rate": 2.1318681318681318e-05, + "loss": 0.4589, + "step": 6786, + "task_loss": 0.8307963013648987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48436540365219116, + "epoch": 5.74, + "learning_rate": 2.131445477599324e-05, + "loss": 0.6355, + "step": 6787, + "task_loss": 0.4870648980140686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6092596054077148, + "epoch": 5.74, + "learning_rate": 2.1310228233305157e-05, + "loss": 0.5109, + "step": 6788, + "task_loss": 0.5954377055168152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2594757080078125, + "epoch": 5.74, + "learning_rate": 2.1306001690617073e-05, + "loss": 0.5333, + "step": 6789, + "task_loss": 0.15603889524936676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.525944709777832, + "epoch": 5.74, + "learning_rate": 2.1301775147928997e-05, + "loss": 0.5111, + "step": 6790, + "task_loss": 0.35330891609191895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40124428272247314, + "epoch": 5.74, + "learning_rate": 2.1297548605240913e-05, + "loss": 0.5026, + "step": 6791, + "task_loss": 0.849157452583313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4003504514694214, + "epoch": 5.74, + "learning_rate": 2.1293322062552833e-05, + "loss": 0.683, + "step": 6792, + "task_loss": 0.7268747091293335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42773720622062683, + "epoch": 5.74, + "learning_rate": 2.1289095519864752e-05, + "loss": 0.4814, + "step": 6793, + "task_loss": 0.32103240489959717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5112349390983582, + "epoch": 5.74, + "learning_rate": 2.128486897717667e-05, + "loss": 0.4346, + "step": 6794, + "task_loss": 0.5772743225097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5459302067756653, + "epoch": 5.74, + "learning_rate": 2.128064243448859e-05, + "loss": 0.4874, + "step": 6795, + "task_loss": 1.653991460800171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40330782532691956, + "epoch": 5.74, + "learning_rate": 2.1276415891800508e-05, + "loss": 0.4902, + "step": 6796, + "task_loss": 0.4302142262458801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3176397383213043, + "epoch": 5.75, + "learning_rate": 2.1272189349112425e-05, + "loss": 0.396, + "step": 6797, + "task_loss": 0.28809553384780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6232177019119263, + "epoch": 5.75, + "learning_rate": 2.1267962806424348e-05, + "loss": 0.5835, + "step": 6798, + "task_loss": 0.3648402988910675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7750144004821777, + "epoch": 5.75, + "learning_rate": 2.1263736263736264e-05, + "loss": 0.5831, + "step": 6799, + "task_loss": 0.5778810977935791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22234627604484558, + "epoch": 5.75, + "learning_rate": 2.1259509721048184e-05, + "loss": 0.4266, + "step": 6800, + "task_loss": 0.44662272930145264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6292132139205933, + "epoch": 5.75, + "learning_rate": 2.1255283178360104e-05, + "loss": 0.532, + "step": 6801, + "task_loss": 1.0702412128448486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9979573488235474, + "epoch": 5.75, + "learning_rate": 2.125105663567202e-05, + "loss": 0.4921, + "step": 6802, + "task_loss": 0.6450396776199341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31618595123291016, + "epoch": 5.75, + "learning_rate": 2.124683009298394e-05, + "loss": 0.4398, + "step": 6803, + "task_loss": 0.39285609126091003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39941421151161194, + "epoch": 5.75, + "learning_rate": 2.124260355029586e-05, + "loss": 0.3983, + "step": 6804, + "task_loss": 0.250224232673645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5914482474327087, + "epoch": 5.75, + "learning_rate": 2.1238377007607776e-05, + "loss": 0.5198, + "step": 6805, + "task_loss": 1.0823137760162354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34581875801086426, + "epoch": 5.75, + "learning_rate": 2.1234150464919695e-05, + "loss": 0.3962, + "step": 6806, + "task_loss": 1.0628114938735962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6719406843185425, + "epoch": 5.75, + "learning_rate": 2.1229923922231615e-05, + "loss": 0.5499, + "step": 6807, + "task_loss": 0.465394526720047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5483078360557556, + "epoch": 5.75, + "learning_rate": 2.1225697379543535e-05, + "loss": 0.623, + "step": 6808, + "task_loss": 1.5166280269622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5266494154930115, + "epoch": 5.76, + "learning_rate": 2.1221470836855455e-05, + "loss": 0.4517, + "step": 6809, + "task_loss": 0.45015373826026917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6125856041908264, + "epoch": 5.76, + "learning_rate": 2.121724429416737e-05, + "loss": 0.4907, + "step": 6810, + "task_loss": 0.13124988973140717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4500581622123718, + "epoch": 5.76, + "learning_rate": 2.121301775147929e-05, + "loss": 0.5281, + "step": 6811, + "task_loss": 0.13881105184555054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2730002999305725, + "epoch": 5.76, + "learning_rate": 2.120879120879121e-05, + "loss": 0.3554, + "step": 6812, + "task_loss": 0.5143589973449707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2998513877391815, + "epoch": 5.76, + "learning_rate": 2.120456466610313e-05, + "loss": 0.4916, + "step": 6813, + "task_loss": 0.7103821635246277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2632826566696167, + "epoch": 5.76, + "learning_rate": 2.1200338123415047e-05, + "loss": 0.2724, + "step": 6814, + "task_loss": 0.657924473285675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5065885782241821, + "epoch": 5.76, + "learning_rate": 2.1196111580726966e-05, + "loss": 0.4249, + "step": 6815, + "task_loss": 1.1604305505752563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3876535892486572, + "epoch": 5.76, + "learning_rate": 2.1191885038038886e-05, + "loss": 0.4899, + "step": 6816, + "task_loss": 0.879410445690155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5263018608093262, + "epoch": 5.76, + "learning_rate": 2.1187658495350802e-05, + "loss": 0.573, + "step": 6817, + "task_loss": 1.6745277643203735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37311917543411255, + "epoch": 5.76, + "learning_rate": 2.1183431952662722e-05, + "loss": 0.4211, + "step": 6818, + "task_loss": 0.46173784136772156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5550713539123535, + "epoch": 5.76, + "learning_rate": 2.1179205409974642e-05, + "loss": 0.4666, + "step": 6819, + "task_loss": 0.998883843421936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6994566321372986, + "epoch": 5.76, + "learning_rate": 2.117497886728656e-05, + "loss": 0.5161, + "step": 6820, + "task_loss": 0.5035641193389893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14464640617370605, + "epoch": 5.77, + "learning_rate": 2.117075232459848e-05, + "loss": 0.4098, + "step": 6821, + "task_loss": 0.3938554525375366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21956972777843475, + "epoch": 5.77, + "learning_rate": 2.1166525781910398e-05, + "loss": 0.4419, + "step": 6822, + "task_loss": 0.08559399098157883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3366870582103729, + "epoch": 5.77, + "learning_rate": 2.1162299239222317e-05, + "loss": 0.5934, + "step": 6823, + "task_loss": 0.6493999361991882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19049642980098724, + "epoch": 5.77, + "learning_rate": 2.1158072696534237e-05, + "loss": 0.3677, + "step": 6824, + "task_loss": 0.0458822026848793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5027066469192505, + "epoch": 5.77, + "learning_rate": 2.1153846153846154e-05, + "loss": 0.4415, + "step": 6825, + "task_loss": 1.1761420965194702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33831173181533813, + "epoch": 5.77, + "learning_rate": 2.1149619611158073e-05, + "loss": 0.4057, + "step": 6826, + "task_loss": 0.47489920258522034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6495150327682495, + "epoch": 5.77, + "learning_rate": 2.1145393068469993e-05, + "loss": 0.5032, + "step": 6827, + "task_loss": 0.2836555540561676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.739142656326294, + "epoch": 5.77, + "learning_rate": 2.114116652578191e-05, + "loss": 0.5101, + "step": 6828, + "task_loss": 0.9493443369865417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4600968658924103, + "epoch": 5.77, + "learning_rate": 2.1136939983093833e-05, + "loss": 0.487, + "step": 6829, + "task_loss": 0.22272975742816925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4258127510547638, + "epoch": 5.77, + "learning_rate": 2.113271344040575e-05, + "loss": 0.5072, + "step": 6830, + "task_loss": 0.09586464613676071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41531744599342346, + "epoch": 5.77, + "learning_rate": 2.112848689771767e-05, + "loss": 0.5008, + "step": 6831, + "task_loss": 0.41358157992362976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.0277512073516846, + "epoch": 5.77, + "learning_rate": 2.112426035502959e-05, + "loss": 0.6109, + "step": 6832, + "task_loss": 1.2728232145309448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3542889952659607, + "epoch": 5.78, + "learning_rate": 2.1120033812341505e-05, + "loss": 0.5373, + "step": 6833, + "task_loss": 1.1528089046478271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5336217284202576, + "epoch": 5.78, + "learning_rate": 2.1115807269653424e-05, + "loss": 0.4377, + "step": 6834, + "task_loss": 0.7546467781066895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3307652771472931, + "epoch": 5.78, + "learning_rate": 2.1111580726965344e-05, + "loss": 0.4796, + "step": 6835, + "task_loss": 0.6476584672927856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4327883720397949, + "epoch": 5.78, + "learning_rate": 2.110735418427726e-05, + "loss": 0.462, + "step": 6836, + "task_loss": 0.2784275710582733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24504417181015015, + "epoch": 5.78, + "learning_rate": 2.1103127641589184e-05, + "loss": 0.4415, + "step": 6837, + "task_loss": 0.560055136680603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4066360890865326, + "epoch": 5.78, + "learning_rate": 2.10989010989011e-05, + "loss": 0.6412, + "step": 6838, + "task_loss": 0.6960163712501526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5056667327880859, + "epoch": 5.78, + "learning_rate": 2.1094674556213016e-05, + "loss": 0.381, + "step": 6839, + "task_loss": 1.0946102142333984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7472718358039856, + "epoch": 5.78, + "learning_rate": 2.109044801352494e-05, + "loss": 0.5438, + "step": 6840, + "task_loss": 1.369212031364441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3987520635128021, + "epoch": 5.78, + "learning_rate": 2.1086221470836856e-05, + "loss": 0.4121, + "step": 6841, + "task_loss": 0.6876682639122009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2448863387107849, + "epoch": 5.78, + "learning_rate": 2.1081994928148776e-05, + "loss": 0.4465, + "step": 6842, + "task_loss": 0.3026348352432251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6523715257644653, + "epoch": 5.78, + "learning_rate": 2.1077768385460695e-05, + "loss": 0.5269, + "step": 6843, + "task_loss": 0.2225721776485443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5302640795707703, + "epoch": 5.78, + "learning_rate": 2.107354184277261e-05, + "loss": 0.4623, + "step": 6844, + "task_loss": 1.5482683181762695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7110546827316284, + "epoch": 5.79, + "learning_rate": 2.106931530008453e-05, + "loss": 0.4753, + "step": 6845, + "task_loss": 0.47345587611198425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3790833652019501, + "epoch": 5.79, + "learning_rate": 2.106508875739645e-05, + "loss": 0.3771, + "step": 6846, + "task_loss": 0.20857547223567963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3964225649833679, + "epoch": 5.79, + "learning_rate": 2.1060862214708367e-05, + "loss": 0.4294, + "step": 6847, + "task_loss": 0.14243261516094208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41636669635772705, + "epoch": 5.79, + "learning_rate": 2.105663567202029e-05, + "loss": 0.4275, + "step": 6848, + "task_loss": 0.566111147403717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48904043436050415, + "epoch": 5.79, + "learning_rate": 2.1052409129332207e-05, + "loss": 0.3679, + "step": 6849, + "task_loss": 1.3460726737976074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35177889466285706, + "epoch": 5.79, + "learning_rate": 2.1048182586644127e-05, + "loss": 0.3922, + "step": 6850, + "task_loss": 0.29798099398612976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5769786834716797, + "epoch": 5.79, + "learning_rate": 2.1043956043956046e-05, + "loss": 0.4667, + "step": 6851, + "task_loss": 0.3875521719455719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31912124156951904, + "epoch": 5.79, + "learning_rate": 2.1039729501267963e-05, + "loss": 0.4313, + "step": 6852, + "task_loss": 0.32382670044898987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38843244314193726, + "epoch": 5.79, + "learning_rate": 2.1035502958579883e-05, + "loss": 0.4252, + "step": 6853, + "task_loss": 1.0679205656051636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3662281334400177, + "epoch": 5.79, + "learning_rate": 2.1031276415891802e-05, + "loss": 0.5041, + "step": 6854, + "task_loss": 0.6846098303794861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5711381435394287, + "epoch": 5.79, + "learning_rate": 2.102704987320372e-05, + "loss": 0.5762, + "step": 6855, + "task_loss": 0.9002973437309265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5546905994415283, + "epoch": 5.79, + "learning_rate": 2.102282333051564e-05, + "loss": 0.3673, + "step": 6856, + "task_loss": 0.6081010103225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9307261109352112, + "epoch": 5.8, + "learning_rate": 2.1018596787827558e-05, + "loss": 0.5543, + "step": 6857, + "task_loss": 1.546901822090149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8297597169876099, + "epoch": 5.8, + "learning_rate": 2.1014370245139478e-05, + "loss": 0.6043, + "step": 6858, + "task_loss": 1.2760603427886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.324114590883255, + "epoch": 5.8, + "learning_rate": 2.1010143702451394e-05, + "loss": 0.4176, + "step": 6859, + "task_loss": 0.6934968829154968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4704315662384033, + "epoch": 5.8, + "learning_rate": 2.1005917159763314e-05, + "loss": 0.4553, + "step": 6860, + "task_loss": 0.8126189708709717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24674999713897705, + "epoch": 5.8, + "learning_rate": 2.1001690617075234e-05, + "loss": 0.3397, + "step": 6861, + "task_loss": 0.5640186071395874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5420013666152954, + "epoch": 5.8, + "learning_rate": 2.0997464074387153e-05, + "loss": 0.4965, + "step": 6862, + "task_loss": 1.0041961669921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5000835061073303, + "epoch": 5.8, + "learning_rate": 2.099323753169907e-05, + "loss": 0.4163, + "step": 6863, + "task_loss": 1.16087007522583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6482635736465454, + "epoch": 5.8, + "learning_rate": 2.098901098901099e-05, + "loss": 0.4986, + "step": 6864, + "task_loss": 0.6807727813720703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15824642777442932, + "epoch": 5.8, + "learning_rate": 2.098478444632291e-05, + "loss": 0.4103, + "step": 6865, + "task_loss": 0.007307421416044235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5773793458938599, + "epoch": 5.8, + "learning_rate": 2.098055790363483e-05, + "loss": 0.4147, + "step": 6866, + "task_loss": 0.7449727654457092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3035845160484314, + "epoch": 5.8, + "learning_rate": 2.0976331360946745e-05, + "loss": 0.5636, + "step": 6867, + "task_loss": 0.35404837131500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4838506877422333, + "epoch": 5.81, + "learning_rate": 2.0972104818258665e-05, + "loss": 0.4828, + "step": 6868, + "task_loss": 1.5215617418289185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.417070209980011, + "epoch": 5.81, + "learning_rate": 2.0967878275570585e-05, + "loss": 0.4846, + "step": 6869, + "task_loss": 0.8895341157913208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6175264120101929, + "epoch": 5.81, + "learning_rate": 2.09636517328825e-05, + "loss": 0.4822, + "step": 6870, + "task_loss": 0.7183728814125061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4873811602592468, + "epoch": 5.81, + "learning_rate": 2.0959425190194424e-05, + "loss": 0.4678, + "step": 6871, + "task_loss": 0.606157660484314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5087961554527283, + "epoch": 5.81, + "learning_rate": 2.095519864750634e-05, + "loss": 0.4193, + "step": 6872, + "task_loss": 1.1622354984283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5356823801994324, + "epoch": 5.81, + "learning_rate": 2.095097210481826e-05, + "loss": 0.487, + "step": 6873, + "task_loss": 0.3554946482181549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29089847207069397, + "epoch": 5.81, + "learning_rate": 2.094674556213018e-05, + "loss": 0.324, + "step": 6874, + "task_loss": 0.3924228250980377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24230298399925232, + "epoch": 5.81, + "learning_rate": 2.0942519019442096e-05, + "loss": 0.4417, + "step": 6875, + "task_loss": 0.322633296251297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7427131533622742, + "epoch": 5.81, + "learning_rate": 2.0938292476754016e-05, + "loss": 0.513, + "step": 6876, + "task_loss": 0.43294665217399597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41000860929489136, + "epoch": 5.81, + "learning_rate": 2.0934065934065936e-05, + "loss": 0.3547, + "step": 6877, + "task_loss": 0.518623411655426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32185161113739014, + "epoch": 5.81, + "learning_rate": 2.0929839391377852e-05, + "loss": 0.4657, + "step": 6878, + "task_loss": 0.6284551620483398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3538426160812378, + "epoch": 5.81, + "learning_rate": 2.0925612848689775e-05, + "loss": 0.5171, + "step": 6879, + "task_loss": 0.8997548818588257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43647468090057373, + "epoch": 5.82, + "learning_rate": 2.0921386306001692e-05, + "loss": 0.4572, + "step": 6880, + "task_loss": 0.1140788197517395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30595213174819946, + "epoch": 5.82, + "learning_rate": 2.0917159763313608e-05, + "loss": 0.4401, + "step": 6881, + "task_loss": 0.7451620101928711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33447811007499695, + "epoch": 5.82, + "learning_rate": 2.091293322062553e-05, + "loss": 0.4045, + "step": 6882, + "task_loss": 0.7599992156028748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4289398491382599, + "epoch": 5.82, + "learning_rate": 2.0908706677937448e-05, + "loss": 0.4473, + "step": 6883, + "task_loss": 0.5802550315856934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4537455439567566, + "epoch": 5.82, + "learning_rate": 2.0904480135249367e-05, + "loss": 0.3273, + "step": 6884, + "task_loss": 0.8239174485206604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23544958233833313, + "epoch": 5.82, + "learning_rate": 2.0900253592561287e-05, + "loss": 0.3563, + "step": 6885, + "task_loss": 0.09476306289434433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3304581046104431, + "epoch": 5.82, + "learning_rate": 2.0896027049873203e-05, + "loss": 0.4708, + "step": 6886, + "task_loss": 0.8620635867118835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6766111254692078, + "epoch": 5.82, + "learning_rate": 2.0891800507185123e-05, + "loss": 0.5107, + "step": 6887, + "task_loss": 1.0529582500457764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30495965480804443, + "epoch": 5.82, + "learning_rate": 2.0887573964497043e-05, + "loss": 0.3258, + "step": 6888, + "task_loss": 0.3499838709831238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6093863844871521, + "epoch": 5.82, + "learning_rate": 2.088334742180896e-05, + "loss": 0.4456, + "step": 6889, + "task_loss": 0.4287920594215393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3391032814979553, + "epoch": 5.82, + "learning_rate": 2.0879120879120882e-05, + "loss": 0.4857, + "step": 6890, + "task_loss": 0.9017887115478516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36752891540527344, + "epoch": 5.82, + "learning_rate": 2.08748943364328e-05, + "loss": 0.4111, + "step": 6891, + "task_loss": 0.4394024908542633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3980378806591034, + "epoch": 5.83, + "learning_rate": 2.0870667793744715e-05, + "loss": 0.4146, + "step": 6892, + "task_loss": 0.021777141839265823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3142988681793213, + "epoch": 5.83, + "learning_rate": 2.0866441251056638e-05, + "loss": 0.4208, + "step": 6893, + "task_loss": 0.27094897627830505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4553554356098175, + "epoch": 5.83, + "learning_rate": 2.0862214708368555e-05, + "loss": 0.5404, + "step": 6894, + "task_loss": 1.224256157875061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2626907527446747, + "epoch": 5.83, + "learning_rate": 2.0857988165680474e-05, + "loss": 0.3815, + "step": 6895, + "task_loss": 0.855850100517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.531815230846405, + "epoch": 5.83, + "learning_rate": 2.0853761622992394e-05, + "loss": 0.5996, + "step": 6896, + "task_loss": 0.5966977477073669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31711798906326294, + "epoch": 5.83, + "learning_rate": 2.084953508030431e-05, + "loss": 0.4046, + "step": 6897, + "task_loss": 0.6825155019760132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38439205288887024, + "epoch": 5.83, + "learning_rate": 2.084530853761623e-05, + "loss": 0.5293, + "step": 6898, + "task_loss": 0.8456814885139465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15853556990623474, + "epoch": 5.83, + "learning_rate": 2.084108199492815e-05, + "loss": 0.3849, + "step": 6899, + "task_loss": 0.06816264241933823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5972846746444702, + "epoch": 5.83, + "learning_rate": 2.083685545224007e-05, + "loss": 0.4852, + "step": 6900, + "task_loss": 1.376851201057434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.455689936876297, + "epoch": 5.83, + "learning_rate": 2.083262890955199e-05, + "loss": 0.4725, + "step": 6901, + "task_loss": 0.5702486038208008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46228501200675964, + "epoch": 5.83, + "learning_rate": 2.0828402366863906e-05, + "loss": 0.4756, + "step": 6902, + "task_loss": 0.4654257893562317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3355134129524231, + "epoch": 5.83, + "learning_rate": 2.0824175824175825e-05, + "loss": 0.4657, + "step": 6903, + "task_loss": 1.098803162574768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2770090103149414, + "epoch": 5.84, + "learning_rate": 2.0819949281487745e-05, + "loss": 0.3887, + "step": 6904, + "task_loss": 0.6073124408721924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2100699096918106, + "epoch": 5.84, + "learning_rate": 2.081572273879966e-05, + "loss": 0.3216, + "step": 6905, + "task_loss": 0.21566464006900787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29921281337738037, + "epoch": 5.84, + "learning_rate": 2.081149619611158e-05, + "loss": 0.3604, + "step": 6906, + "task_loss": 0.45757758617401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25455623865127563, + "epoch": 5.84, + "learning_rate": 2.08072696534235e-05, + "loss": 0.3524, + "step": 6907, + "task_loss": 0.07059229165315628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5089364051818848, + "epoch": 5.84, + "learning_rate": 2.080304311073542e-05, + "loss": 0.4101, + "step": 6908, + "task_loss": 0.5950535535812378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44800323247909546, + "epoch": 5.84, + "learning_rate": 2.0798816568047337e-05, + "loss": 0.4616, + "step": 6909, + "task_loss": 0.80549156665802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3771620988845825, + "epoch": 5.84, + "learning_rate": 2.0794590025359257e-05, + "loss": 0.4364, + "step": 6910, + "task_loss": 1.1688623428344727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.546317458152771, + "epoch": 5.84, + "learning_rate": 2.0790363482671177e-05, + "loss": 0.5008, + "step": 6911, + "task_loss": 0.38493847846984863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48115646839141846, + "epoch": 5.84, + "learning_rate": 2.0786136939983096e-05, + "loss": 0.5804, + "step": 6912, + "task_loss": 1.0825543403625488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44323229789733887, + "epoch": 5.84, + "learning_rate": 2.0781910397295013e-05, + "loss": 0.462, + "step": 6913, + "task_loss": 1.199125051498413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.680584192276001, + "epoch": 5.84, + "learning_rate": 2.0777683854606932e-05, + "loss": 0.4516, + "step": 6914, + "task_loss": 0.4622798562049866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29810604453086853, + "epoch": 5.84, + "learning_rate": 2.0773457311918852e-05, + "loss": 0.4058, + "step": 6915, + "task_loss": 0.6172472238540649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6477153301239014, + "epoch": 5.85, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.4511, + "step": 6916, + "task_loss": 0.7039207816123962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23615384101867676, + "epoch": 5.85, + "learning_rate": 2.0765004226542688e-05, + "loss": 0.362, + "step": 6917, + "task_loss": 0.24976134300231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43328922986984253, + "epoch": 5.85, + "learning_rate": 2.0760777683854608e-05, + "loss": 0.5557, + "step": 6918, + "task_loss": 0.6124016642570496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.678520679473877, + "epoch": 5.85, + "learning_rate": 2.0756551141166528e-05, + "loss": 0.5384, + "step": 6919, + "task_loss": 0.5957140922546387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5203990340232849, + "epoch": 5.85, + "learning_rate": 2.0752324598478444e-05, + "loss": 0.4904, + "step": 6920, + "task_loss": 1.1569639444351196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.555241048336029, + "epoch": 5.85, + "learning_rate": 2.0748098055790364e-05, + "loss": 0.4127, + "step": 6921, + "task_loss": 0.5554826855659485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3438481092453003, + "epoch": 5.85, + "learning_rate": 2.0743871513102283e-05, + "loss": 0.4762, + "step": 6922, + "task_loss": 0.9312421679496765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6765172481536865, + "epoch": 5.85, + "learning_rate": 2.07396449704142e-05, + "loss": 0.4982, + "step": 6923, + "task_loss": 0.7163085341453552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33975645899772644, + "epoch": 5.85, + "learning_rate": 2.0735418427726123e-05, + "loss": 0.3875, + "step": 6924, + "task_loss": 0.3946218192577362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4212139844894409, + "epoch": 5.85, + "learning_rate": 2.073119188503804e-05, + "loss": 0.5296, + "step": 6925, + "task_loss": 0.18994076550006866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2789541482925415, + "epoch": 5.85, + "learning_rate": 2.072696534234996e-05, + "loss": 0.3817, + "step": 6926, + "task_loss": 0.21240371465682983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8152321577072144, + "epoch": 5.85, + "learning_rate": 2.072273879966188e-05, + "loss": 0.6065, + "step": 6927, + "task_loss": 0.381613552570343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4205235242843628, + "epoch": 5.86, + "learning_rate": 2.0718512256973795e-05, + "loss": 0.4891, + "step": 6928, + "task_loss": 0.697672426700592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2164396047592163, + "epoch": 5.86, + "learning_rate": 2.0714285714285718e-05, + "loss": 0.4187, + "step": 6929, + "task_loss": 0.35104691982269287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5050743818283081, + "epoch": 5.86, + "learning_rate": 2.0710059171597635e-05, + "loss": 0.4805, + "step": 6930, + "task_loss": 0.3537675142288208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.831802248954773, + "epoch": 5.86, + "learning_rate": 2.070583262890955e-05, + "loss": 0.6558, + "step": 6931, + "task_loss": 1.1591541767120361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39277392625808716, + "epoch": 5.86, + "learning_rate": 2.0701606086221474e-05, + "loss": 0.5059, + "step": 6932, + "task_loss": 1.1958526372909546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4644114673137665, + "epoch": 5.86, + "learning_rate": 2.069737954353339e-05, + "loss": 0.5094, + "step": 6933, + "task_loss": 0.4394320249557495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.415141761302948, + "epoch": 5.86, + "learning_rate": 2.0693153000845307e-05, + "loss": 0.482, + "step": 6934, + "task_loss": 0.2355649620294571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5091354846954346, + "epoch": 5.86, + "learning_rate": 2.068892645815723e-05, + "loss": 0.3388, + "step": 6935, + "task_loss": 0.8844378590583801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6866999268531799, + "epoch": 5.86, + "learning_rate": 2.0684699915469146e-05, + "loss": 0.4656, + "step": 6936, + "task_loss": 0.6222223043441772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4982926547527313, + "epoch": 5.86, + "learning_rate": 2.0680473372781066e-05, + "loss": 0.4589, + "step": 6937, + "task_loss": 0.4388187527656555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3379089832305908, + "epoch": 5.86, + "learning_rate": 2.0676246830092986e-05, + "loss": 0.3769, + "step": 6938, + "task_loss": 0.06551823765039444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2937958240509033, + "epoch": 5.87, + "learning_rate": 2.0672020287404902e-05, + "loss": 0.3521, + "step": 6939, + "task_loss": 0.5402923822402954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5408083200454712, + "epoch": 5.87, + "learning_rate": 2.0667793744716822e-05, + "loss": 0.46, + "step": 6940, + "task_loss": 0.49334830045700073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34332144260406494, + "epoch": 5.87, + "learning_rate": 2.066356720202874e-05, + "loss": 0.4943, + "step": 6941, + "task_loss": 0.8118143677711487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4961376190185547, + "epoch": 5.87, + "learning_rate": 2.0659340659340658e-05, + "loss": 0.513, + "step": 6942, + "task_loss": 0.12239488959312439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48297691345214844, + "epoch": 5.87, + "learning_rate": 2.065511411665258e-05, + "loss": 0.5205, + "step": 6943, + "task_loss": 0.24743522703647614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3032165467739105, + "epoch": 5.87, + "learning_rate": 2.0650887573964497e-05, + "loss": 0.4141, + "step": 6944, + "task_loss": 0.8442246317863464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2457922101020813, + "epoch": 5.87, + "learning_rate": 2.0646661031276417e-05, + "loss": 0.4021, + "step": 6945, + "task_loss": 0.9032883644104004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2674260437488556, + "epoch": 5.87, + "learning_rate": 2.0642434488588337e-05, + "loss": 0.4044, + "step": 6946, + "task_loss": 0.4631931781768799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5523655414581299, + "epoch": 5.87, + "learning_rate": 2.0638207945900253e-05, + "loss": 0.4694, + "step": 6947, + "task_loss": 0.5869618058204651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5839467644691467, + "epoch": 5.87, + "learning_rate": 2.0633981403212173e-05, + "loss": 0.5043, + "step": 6948, + "task_loss": 1.1385022401809692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2565206289291382, + "epoch": 5.87, + "learning_rate": 2.0629754860524093e-05, + "loss": 0.3683, + "step": 6949, + "task_loss": 0.17148961126804352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6339231729507446, + "epoch": 5.87, + "learning_rate": 2.062552831783601e-05, + "loss": 0.5042, + "step": 6950, + "task_loss": 0.5123355984687805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5382684469223022, + "epoch": 5.88, + "learning_rate": 2.062130177514793e-05, + "loss": 0.5302, + "step": 6951, + "task_loss": 1.1977189779281616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2415362298488617, + "epoch": 5.88, + "learning_rate": 2.061707523245985e-05, + "loss": 0.4493, + "step": 6952, + "task_loss": 0.3175063729286194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4160362482070923, + "epoch": 5.88, + "learning_rate": 2.0612848689771768e-05, + "loss": 0.3986, + "step": 6953, + "task_loss": 1.3243874311447144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.224104106426239, + "epoch": 5.88, + "learning_rate": 2.0608622147083688e-05, + "loss": 0.4484, + "step": 6954, + "task_loss": 0.07137873768806458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5895787477493286, + "epoch": 5.88, + "learning_rate": 2.0604395604395604e-05, + "loss": 0.4326, + "step": 6955, + "task_loss": 0.6851025819778442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5832515954971313, + "epoch": 5.88, + "learning_rate": 2.0600169061707524e-05, + "loss": 0.5185, + "step": 6956, + "task_loss": 0.5652205944061279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5279256105422974, + "epoch": 5.88, + "learning_rate": 2.0595942519019444e-05, + "loss": 0.5575, + "step": 6957, + "task_loss": 0.8310483694076538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4228052794933319, + "epoch": 5.88, + "learning_rate": 2.0591715976331364e-05, + "loss": 0.3808, + "step": 6958, + "task_loss": 0.7282111644744873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47618064284324646, + "epoch": 5.88, + "learning_rate": 2.058748943364328e-05, + "loss": 0.5138, + "step": 6959, + "task_loss": 0.8880327343940735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39084964990615845, + "epoch": 5.88, + "learning_rate": 2.05832628909552e-05, + "loss": 0.5013, + "step": 6960, + "task_loss": 1.4400535821914673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.70611572265625, + "epoch": 5.88, + "learning_rate": 2.057903634826712e-05, + "loss": 0.5149, + "step": 6961, + "task_loss": 0.5581108927726746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33923280239105225, + "epoch": 5.88, + "learning_rate": 2.0574809805579036e-05, + "loss": 0.4218, + "step": 6962, + "task_loss": 0.6265366077423096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.294910728931427, + "epoch": 5.89, + "learning_rate": 2.0570583262890956e-05, + "loss": 0.4683, + "step": 6963, + "task_loss": 0.3881951868534088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.249252051115036, + "epoch": 5.89, + "learning_rate": 2.0566356720202875e-05, + "loss": 0.3408, + "step": 6964, + "task_loss": 0.16938893496990204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34031033515930176, + "epoch": 5.89, + "learning_rate": 2.0562130177514795e-05, + "loss": 0.5201, + "step": 6965, + "task_loss": 1.5841153860092163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2765214145183563, + "epoch": 5.89, + "learning_rate": 2.0557903634826715e-05, + "loss": 0.4168, + "step": 6966, + "task_loss": 0.14913053810596466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6033191680908203, + "epoch": 5.89, + "learning_rate": 2.055367709213863e-05, + "loss": 0.5496, + "step": 6967, + "task_loss": 0.7337641716003418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2983512580394745, + "epoch": 5.89, + "learning_rate": 2.054945054945055e-05, + "loss": 0.3941, + "step": 6968, + "task_loss": 0.4973376989364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5667001008987427, + "epoch": 5.89, + "learning_rate": 2.054522400676247e-05, + "loss": 0.4624, + "step": 6969, + "task_loss": 1.239099383354187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15192481875419617, + "epoch": 5.89, + "learning_rate": 2.0540997464074387e-05, + "loss": 0.2596, + "step": 6970, + "task_loss": 0.08259785920381546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3858349919319153, + "epoch": 5.89, + "learning_rate": 2.0536770921386307e-05, + "loss": 0.4214, + "step": 6971, + "task_loss": 0.36624372005462646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5908135771751404, + "epoch": 5.89, + "learning_rate": 2.0532544378698226e-05, + "loss": 0.4897, + "step": 6972, + "task_loss": 1.1913702487945557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3814309239387512, + "epoch": 5.89, + "learning_rate": 2.0528317836010143e-05, + "loss": 0.3859, + "step": 6973, + "task_loss": 0.7360821962356567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.465462863445282, + "epoch": 5.89, + "learning_rate": 2.0524091293322066e-05, + "loss": 0.5608, + "step": 6974, + "task_loss": 1.012315273284912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5127521753311157, + "epoch": 5.9, + "learning_rate": 2.0519864750633982e-05, + "loss": 0.5553, + "step": 6975, + "task_loss": 0.7014459371566772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4636867046356201, + "epoch": 5.9, + "learning_rate": 2.0515638207945902e-05, + "loss": 0.4922, + "step": 6976, + "task_loss": 0.5733063817024231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5170900821685791, + "epoch": 5.9, + "learning_rate": 2.051141166525782e-05, + "loss": 0.6136, + "step": 6977, + "task_loss": 1.0354149341583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41047024726867676, + "epoch": 5.9, + "learning_rate": 2.0507185122569738e-05, + "loss": 0.6041, + "step": 6978, + "task_loss": 0.6598640084266663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1919073611497879, + "epoch": 5.9, + "learning_rate": 2.0502958579881658e-05, + "loss": 0.5014, + "step": 6979, + "task_loss": 0.19847360253334045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49224385619163513, + "epoch": 5.9, + "learning_rate": 2.0498732037193578e-05, + "loss": 0.4844, + "step": 6980, + "task_loss": 0.5532199740409851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5924396514892578, + "epoch": 5.9, + "learning_rate": 2.0494505494505494e-05, + "loss": 0.4928, + "step": 6981, + "task_loss": 0.510155439376831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2171267867088318, + "epoch": 5.9, + "learning_rate": 2.0490278951817417e-05, + "loss": 0.4683, + "step": 6982, + "task_loss": 0.4478294253349304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31343963742256165, + "epoch": 5.9, + "learning_rate": 2.0486052409129333e-05, + "loss": 0.5461, + "step": 6983, + "task_loss": 0.28888171911239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30224156379699707, + "epoch": 5.9, + "learning_rate": 2.048182586644125e-05, + "loss": 0.4037, + "step": 6984, + "task_loss": 0.49089041352272034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33464598655700684, + "epoch": 5.9, + "learning_rate": 2.0477599323753173e-05, + "loss": 0.44, + "step": 6985, + "task_loss": 0.26264408230781555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4295983910560608, + "epoch": 5.9, + "learning_rate": 2.047337278106509e-05, + "loss": 0.3686, + "step": 6986, + "task_loss": 0.5043914914131165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29914841055870056, + "epoch": 5.91, + "learning_rate": 2.046914623837701e-05, + "loss": 0.3713, + "step": 6987, + "task_loss": 0.22969037294387817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.665727972984314, + "epoch": 5.91, + "learning_rate": 2.046491969568893e-05, + "loss": 0.5371, + "step": 6988, + "task_loss": 1.1917181015014648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3700661063194275, + "epoch": 5.91, + "learning_rate": 2.0460693153000845e-05, + "loss": 0.3794, + "step": 6989, + "task_loss": 0.5996010899543762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48665523529052734, + "epoch": 5.91, + "learning_rate": 2.0456466610312765e-05, + "loss": 0.4218, + "step": 6990, + "task_loss": 1.3244385719299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4173729121685028, + "epoch": 5.91, + "learning_rate": 2.0452240067624684e-05, + "loss": 0.3932, + "step": 6991, + "task_loss": 1.0321753025054932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5391624569892883, + "epoch": 5.91, + "learning_rate": 2.04480135249366e-05, + "loss": 0.4709, + "step": 6992, + "task_loss": 0.3678254187107086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34529024362564087, + "epoch": 5.91, + "learning_rate": 2.0443786982248524e-05, + "loss": 0.4229, + "step": 6993, + "task_loss": 0.4369000792503357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3824813961982727, + "epoch": 5.91, + "learning_rate": 2.043956043956044e-05, + "loss": 0.4521, + "step": 6994, + "task_loss": 1.2072104215621948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6146224737167358, + "epoch": 5.91, + "learning_rate": 2.043533389687236e-05, + "loss": 0.5808, + "step": 6995, + "task_loss": 1.2046154737472534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29365992546081543, + "epoch": 5.91, + "learning_rate": 2.043110735418428e-05, + "loss": 0.4759, + "step": 6996, + "task_loss": 0.5661774277687073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5630391836166382, + "epoch": 5.91, + "learning_rate": 2.0426880811496196e-05, + "loss": 0.4977, + "step": 6997, + "task_loss": 1.1067628860473633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5411484241485596, + "epoch": 5.91, + "learning_rate": 2.0422654268808116e-05, + "loss": 0.5681, + "step": 6998, + "task_loss": 0.7602152824401855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5079545974731445, + "epoch": 5.92, + "learning_rate": 2.0418427726120036e-05, + "loss": 0.5346, + "step": 6999, + "task_loss": 0.4370797872543335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45872044563293457, + "epoch": 5.92, + "learning_rate": 2.0414201183431952e-05, + "loss": 0.4423, + "step": 7000, + "task_loss": 0.49306046962738037 + }, + { + "epoch": 5.92, + "eval_accuracy": 0.9091881188118812, + "eval_loss": 0.30122703313827515, + "eval_runtime": 229.1684, + "eval_samples_per_second": 110.181, + "eval_steps_per_second": 0.864, + "step": 7000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38473546504974365, + "epoch": 5.92, + "learning_rate": 2.0409974640743872e-05, + "loss": 0.5171, + "step": 7001, + "task_loss": 0.6840525269508362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5666130185127258, + "epoch": 5.92, + "learning_rate": 2.040574809805579e-05, + "loss": 0.5257, + "step": 7002, + "task_loss": 0.4802851974964142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36271658539772034, + "epoch": 5.92, + "learning_rate": 2.040152155536771e-05, + "loss": 0.4802, + "step": 7003, + "task_loss": 0.7734671235084534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5573059320449829, + "epoch": 5.92, + "learning_rate": 2.0397295012679628e-05, + "loss": 0.511, + "step": 7004, + "task_loss": 0.5650498270988464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2098025381565094, + "epoch": 5.92, + "learning_rate": 2.0393068469991547e-05, + "loss": 0.5231, + "step": 7005, + "task_loss": 0.08263315260410309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2708004415035248, + "epoch": 5.92, + "learning_rate": 2.0388841927303467e-05, + "loss": 0.4776, + "step": 7006, + "task_loss": 0.03214747831225395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7136062383651733, + "epoch": 5.92, + "learning_rate": 2.0384615384615387e-05, + "loss": 0.5654, + "step": 7007, + "task_loss": 1.1165952682495117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36361658573150635, + "epoch": 5.92, + "learning_rate": 2.0380388841927303e-05, + "loss": 0.5594, + "step": 7008, + "task_loss": 0.8413437008857727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1874292641878128, + "epoch": 5.92, + "learning_rate": 2.0376162299239223e-05, + "loss": 0.3775, + "step": 7009, + "task_loss": 0.44430458545684814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.037595510482788, + "epoch": 5.93, + "learning_rate": 2.0371935756551143e-05, + "loss": 0.534, + "step": 7010, + "task_loss": 1.4111241102218628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3515136241912842, + "epoch": 5.93, + "learning_rate": 2.0367709213863062e-05, + "loss": 0.3286, + "step": 7011, + "task_loss": 0.40704452991485596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6027913093566895, + "epoch": 5.93, + "learning_rate": 2.036348267117498e-05, + "loss": 0.5005, + "step": 7012, + "task_loss": 0.6376664638519287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4948940873146057, + "epoch": 5.93, + "learning_rate": 2.03592561284869e-05, + "loss": 0.5056, + "step": 7013, + "task_loss": 0.7439824342727661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5963187217712402, + "epoch": 5.93, + "learning_rate": 2.0355029585798818e-05, + "loss": 0.4659, + "step": 7014, + "task_loss": 0.7507460117340088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7237182855606079, + "epoch": 5.93, + "learning_rate": 2.0350803043110734e-05, + "loss": 0.6112, + "step": 7015, + "task_loss": 0.6702473163604736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29223304986953735, + "epoch": 5.93, + "learning_rate": 2.0346576500422658e-05, + "loss": 0.4424, + "step": 7016, + "task_loss": 0.7347798347473145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46810343861579895, + "epoch": 5.93, + "learning_rate": 2.0342349957734574e-05, + "loss": 0.4677, + "step": 7017, + "task_loss": 0.37034061551094055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6313290596008301, + "epoch": 5.93, + "learning_rate": 2.0338123415046494e-05, + "loss": 0.5498, + "step": 7018, + "task_loss": 0.4862879812717438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36205318570137024, + "epoch": 5.93, + "learning_rate": 2.0333896872358413e-05, + "loss": 0.419, + "step": 7019, + "task_loss": 0.7600081562995911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4158157706260681, + "epoch": 5.93, + "learning_rate": 2.032967032967033e-05, + "loss": 0.3667, + "step": 7020, + "task_loss": 0.4066627025604248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4933263063430786, + "epoch": 5.93, + "learning_rate": 2.032544378698225e-05, + "loss": 0.4776, + "step": 7021, + "task_loss": 0.3642802834510803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3373827338218689, + "epoch": 5.94, + "learning_rate": 2.032121724429417e-05, + "loss": 0.4252, + "step": 7022, + "task_loss": 0.8688920736312866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3612859845161438, + "epoch": 5.94, + "learning_rate": 2.0316990701606086e-05, + "loss": 0.4093, + "step": 7023, + "task_loss": 0.61452317237854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28338301181793213, + "epoch": 5.94, + "learning_rate": 2.031276415891801e-05, + "loss": 0.3613, + "step": 7024, + "task_loss": 0.6012794971466064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43837082386016846, + "epoch": 5.94, + "learning_rate": 2.0308537616229925e-05, + "loss": 0.4128, + "step": 7025, + "task_loss": 0.5211491584777832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45048001408576965, + "epoch": 5.94, + "learning_rate": 2.030431107354184e-05, + "loss": 0.4912, + "step": 7026, + "task_loss": 0.1650909036397934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5176198482513428, + "epoch": 5.94, + "learning_rate": 2.0300084530853765e-05, + "loss": 0.37, + "step": 7027, + "task_loss": 1.0422096252441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43670836091041565, + "epoch": 5.94, + "learning_rate": 2.029585798816568e-05, + "loss": 0.4958, + "step": 7028, + "task_loss": 0.7880675196647644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8128395080566406, + "epoch": 5.94, + "learning_rate": 2.02916314454776e-05, + "loss": 0.529, + "step": 7029, + "task_loss": 1.2211318016052246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4492298364639282, + "epoch": 5.94, + "learning_rate": 2.028740490278952e-05, + "loss": 0.4517, + "step": 7030, + "task_loss": 0.5898701548576355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3042801022529602, + "epoch": 5.94, + "learning_rate": 2.0283178360101437e-05, + "loss": 0.4507, + "step": 7031, + "task_loss": 0.8023127913475037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5930072069168091, + "epoch": 5.94, + "learning_rate": 2.0278951817413356e-05, + "loss": 0.4801, + "step": 7032, + "task_loss": 1.1964970827102661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5841115713119507, + "epoch": 5.94, + "learning_rate": 2.0274725274725276e-05, + "loss": 0.5234, + "step": 7033, + "task_loss": 1.2644555568695068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.528667688369751, + "epoch": 5.95, + "learning_rate": 2.0270498732037193e-05, + "loss": 0.4742, + "step": 7034, + "task_loss": 0.5575354099273682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5134127736091614, + "epoch": 5.95, + "learning_rate": 2.0266272189349116e-05, + "loss": 0.4775, + "step": 7035, + "task_loss": 0.23523935675621033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47749000787734985, + "epoch": 5.95, + "learning_rate": 2.0262045646661032e-05, + "loss": 0.5308, + "step": 7036, + "task_loss": 0.5624517798423767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5227967500686646, + "epoch": 5.95, + "learning_rate": 2.025781910397295e-05, + "loss": 0.5867, + "step": 7037, + "task_loss": 0.32910606265068054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3845481276512146, + "epoch": 5.95, + "learning_rate": 2.025359256128487e-05, + "loss": 0.4294, + "step": 7038, + "task_loss": 0.6235036849975586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5552660226821899, + "epoch": 5.95, + "learning_rate": 2.0249366018596788e-05, + "loss": 0.5164, + "step": 7039, + "task_loss": 1.9431121349334717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4469095468521118, + "epoch": 5.95, + "learning_rate": 2.0245139475908708e-05, + "loss": 0.4224, + "step": 7040, + "task_loss": 0.23226583003997803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5176308155059814, + "epoch": 5.95, + "learning_rate": 2.0240912933220627e-05, + "loss": 0.4308, + "step": 7041, + "task_loss": 0.2558535933494568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5335114598274231, + "epoch": 5.95, + "learning_rate": 2.0236686390532544e-05, + "loss": 0.4012, + "step": 7042, + "task_loss": 0.44462838768959045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47712442278862, + "epoch": 5.95, + "learning_rate": 2.0232459847844463e-05, + "loss": 0.3853, + "step": 7043, + "task_loss": 1.314573884010315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5327378511428833, + "epoch": 5.95, + "learning_rate": 2.0228233305156383e-05, + "loss": 0.6413, + "step": 7044, + "task_loss": 1.06952965259552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.272204726934433, + "epoch": 5.95, + "learning_rate": 2.0224006762468303e-05, + "loss": 0.398, + "step": 7045, + "task_loss": 0.6054376363754272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4695577323436737, + "epoch": 5.96, + "learning_rate": 2.0219780219780223e-05, + "loss": 0.3974, + "step": 7046, + "task_loss": 0.14660142362117767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4648245573043823, + "epoch": 5.96, + "learning_rate": 2.021555367709214e-05, + "loss": 0.5524, + "step": 7047, + "task_loss": 0.7094641923904419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3122267723083496, + "epoch": 5.96, + "learning_rate": 2.021132713440406e-05, + "loss": 0.3384, + "step": 7048, + "task_loss": 1.0159333944320679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37727344036102295, + "epoch": 5.96, + "learning_rate": 2.020710059171598e-05, + "loss": 0.4847, + "step": 7049, + "task_loss": 0.6171429753303528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4436185359954834, + "epoch": 5.96, + "learning_rate": 2.0202874049027895e-05, + "loss": 0.3532, + "step": 7050, + "task_loss": 1.3608589172363281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5559684038162231, + "epoch": 5.96, + "learning_rate": 2.0198647506339815e-05, + "loss": 0.4823, + "step": 7051, + "task_loss": 0.3221512734889984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2834300994873047, + "epoch": 5.96, + "learning_rate": 2.0194420963651734e-05, + "loss": 0.4014, + "step": 7052, + "task_loss": 0.995764434337616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6256246566772461, + "epoch": 5.96, + "learning_rate": 2.0190194420963654e-05, + "loss": 0.53, + "step": 7053, + "task_loss": 0.5288268327713013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5749763250350952, + "epoch": 5.96, + "learning_rate": 2.018596787827557e-05, + "loss": 0.4529, + "step": 7054, + "task_loss": 1.5266227722167969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.513207197189331, + "epoch": 5.96, + "learning_rate": 2.018174133558749e-05, + "loss": 0.6147, + "step": 7055, + "task_loss": 0.6271253228187561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41158199310302734, + "epoch": 5.96, + "learning_rate": 2.017751479289941e-05, + "loss": 0.46, + "step": 7056, + "task_loss": 0.3153323829174042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3458254337310791, + "epoch": 5.96, + "learning_rate": 2.017328825021133e-05, + "loss": 0.4521, + "step": 7057, + "task_loss": 0.6203638315200806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31062930822372437, + "epoch": 5.97, + "learning_rate": 2.0169061707523246e-05, + "loss": 0.3472, + "step": 7058, + "task_loss": 0.4665345549583435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3848392069339752, + "epoch": 5.97, + "learning_rate": 2.0164835164835166e-05, + "loss": 0.508, + "step": 7059, + "task_loss": 0.5620155334472656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4599146246910095, + "epoch": 5.97, + "learning_rate": 2.0160608622147085e-05, + "loss": 0.4409, + "step": 7060, + "task_loss": 0.867586076259613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42547208070755005, + "epoch": 5.97, + "learning_rate": 2.0156382079459005e-05, + "loss": 0.5965, + "step": 7061, + "task_loss": 0.8307308554649353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35151779651641846, + "epoch": 5.97, + "learning_rate": 2.015215553677092e-05, + "loss": 0.51, + "step": 7062, + "task_loss": 0.17829468846321106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24665094912052155, + "epoch": 5.97, + "learning_rate": 2.014792899408284e-05, + "loss": 0.396, + "step": 7063, + "task_loss": 0.21072585880756378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3609168529510498, + "epoch": 5.97, + "learning_rate": 2.014370245139476e-05, + "loss": 0.376, + "step": 7064, + "task_loss": 0.3609398305416107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.611214280128479, + "epoch": 5.97, + "learning_rate": 2.0139475908706677e-05, + "loss": 0.629, + "step": 7065, + "task_loss": 1.013014316558838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3529215455055237, + "epoch": 5.97, + "learning_rate": 2.0135249366018597e-05, + "loss": 0.4472, + "step": 7066, + "task_loss": 0.13304711878299713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8130069375038147, + "epoch": 5.97, + "learning_rate": 2.0131022823330517e-05, + "loss": 0.4689, + "step": 7067, + "task_loss": 0.9110226035118103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4751279652118683, + "epoch": 5.97, + "learning_rate": 2.0126796280642433e-05, + "loss": 0.3487, + "step": 7068, + "task_loss": 0.8028808236122131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41768959164619446, + "epoch": 5.97, + "learning_rate": 2.0122569737954356e-05, + "loss": 0.47, + "step": 7069, + "task_loss": 0.8124250173568726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2917250394821167, + "epoch": 5.98, + "learning_rate": 2.0118343195266273e-05, + "loss": 0.4199, + "step": 7070, + "task_loss": 0.29982373118400574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7308906316757202, + "epoch": 5.98, + "learning_rate": 2.0114116652578192e-05, + "loss": 0.4934, + "step": 7071, + "task_loss": 1.0027003288269043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26983407139778137, + "epoch": 5.98, + "learning_rate": 2.0109890109890112e-05, + "loss": 0.3916, + "step": 7072, + "task_loss": 0.830588698387146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.332949161529541, + "epoch": 5.98, + "learning_rate": 2.010566356720203e-05, + "loss": 0.4469, + "step": 7073, + "task_loss": 0.4459330439567566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28934359550476074, + "epoch": 5.98, + "learning_rate": 2.0101437024513948e-05, + "loss": 0.3807, + "step": 7074, + "task_loss": 0.2873598039150238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28923001885414124, + "epoch": 5.98, + "learning_rate": 2.0097210481825868e-05, + "loss": 0.4862, + "step": 7075, + "task_loss": 0.894254207611084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5095264315605164, + "epoch": 5.98, + "learning_rate": 2.0092983939137784e-05, + "loss": 0.3617, + "step": 7076, + "task_loss": 0.4393921196460724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26413285732269287, + "epoch": 5.98, + "learning_rate": 2.0088757396449707e-05, + "loss": 0.3413, + "step": 7077, + "task_loss": 0.42026636004447937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2640991508960724, + "epoch": 5.98, + "learning_rate": 2.0084530853761624e-05, + "loss": 0.3601, + "step": 7078, + "task_loss": 0.1901407092809677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3372025191783905, + "epoch": 5.98, + "learning_rate": 2.008030431107354e-05, + "loss": 0.4201, + "step": 7079, + "task_loss": 0.723630964756012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3172477185726166, + "epoch": 5.98, + "learning_rate": 2.0076077768385463e-05, + "loss": 0.5028, + "step": 7080, + "task_loss": 0.7342675924301147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44924086332321167, + "epoch": 5.99, + "learning_rate": 2.007185122569738e-05, + "loss": 0.4042, + "step": 7081, + "task_loss": 0.4796159565448761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3566740155220032, + "epoch": 5.99, + "learning_rate": 2.00676246830093e-05, + "loss": 0.2899, + "step": 7082, + "task_loss": 0.8122480511665344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5804363489151001, + "epoch": 5.99, + "learning_rate": 2.006339814032122e-05, + "loss": 0.5362, + "step": 7083, + "task_loss": 0.8325528502464294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.477544903755188, + "epoch": 5.99, + "learning_rate": 2.0059171597633135e-05, + "loss": 0.4969, + "step": 7084, + "task_loss": 0.5129575133323669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3535222113132477, + "epoch": 5.99, + "learning_rate": 2.0054945054945055e-05, + "loss": 0.4177, + "step": 7085, + "task_loss": 1.0047675371170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20663830637931824, + "epoch": 5.99, + "learning_rate": 2.0050718512256975e-05, + "loss": 0.3591, + "step": 7086, + "task_loss": 0.7863340377807617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5445507168769836, + "epoch": 5.99, + "learning_rate": 2.004649196956889e-05, + "loss": 0.4676, + "step": 7087, + "task_loss": 0.558000385761261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23296119272708893, + "epoch": 5.99, + "learning_rate": 2.0042265426880814e-05, + "loss": 0.3041, + "step": 7088, + "task_loss": 0.3813418447971344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6151244044303894, + "epoch": 5.99, + "learning_rate": 2.003803888419273e-05, + "loss": 0.587, + "step": 7089, + "task_loss": 1.5875651836395264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45662006735801697, + "epoch": 5.99, + "learning_rate": 2.003381234150465e-05, + "loss": 0.3406, + "step": 7090, + "task_loss": 0.3505646884441376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5623189210891724, + "epoch": 5.99, + "learning_rate": 2.002958579881657e-05, + "loss": 0.4785, + "step": 7091, + "task_loss": 0.8563574552536011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5079002380371094, + "epoch": 5.99, + "learning_rate": 2.0025359256128487e-05, + "loss": 0.522, + "step": 7092, + "task_loss": 1.0612787008285522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5889732241630554, + "epoch": 6.0, + "learning_rate": 2.0021132713440406e-05, + "loss": 0.4594, + "step": 7093, + "task_loss": 0.4863797724246979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7369677424430847, + "epoch": 6.0, + "learning_rate": 2.0016906170752326e-05, + "loss": 0.5509, + "step": 7094, + "task_loss": 0.9107285737991333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41503313183784485, + "epoch": 6.0, + "learning_rate": 2.0012679628064242e-05, + "loss": 0.7397, + "step": 7095, + "task_loss": 0.2903255224227905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.340520977973938, + "epoch": 6.0, + "learning_rate": 2.0008453085376162e-05, + "loss": 0.4755, + "step": 7096, + "task_loss": 0.7630773186683655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48844391107559204, + "epoch": 6.0, + "learning_rate": 2.0004226542688082e-05, + "loss": 0.5623, + "step": 7097, + "task_loss": 0.7638906240463257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.343919038772583, + "epoch": 6.0, + "learning_rate": 2e-05, + "loss": 0.4544, + "step": 7098, + "task_loss": 0.7777069211006165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5701781511306763, + "epoch": 6.0, + "learning_rate": 1.999577345731192e-05, + "loss": 0.9276, + "step": 7099, + "task_loss": 0.3877008259296417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9569276571273804, + "epoch": 6.0, + "learning_rate": 1.9991546914623838e-05, + "loss": 0.509, + "step": 7100, + "task_loss": 0.7410070896148682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3484238386154175, + "epoch": 6.0, + "learning_rate": 1.9987320371935757e-05, + "loss": 0.4693, + "step": 7101, + "task_loss": 0.7160006165504456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35248348116874695, + "epoch": 6.0, + "learning_rate": 1.9983093829247677e-05, + "loss": 0.5155, + "step": 7102, + "task_loss": 0.5324273109436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3718428313732147, + "epoch": 6.0, + "learning_rate": 1.9978867286559594e-05, + "loss": 0.4785, + "step": 7103, + "task_loss": 0.43350714445114136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4625117778778076, + "epoch": 6.01, + "learning_rate": 1.9974640743871513e-05, + "loss": 0.457, + "step": 7104, + "task_loss": 0.6774790287017822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.424936980009079, + "epoch": 6.01, + "learning_rate": 1.9970414201183433e-05, + "loss": 0.4229, + "step": 7105, + "task_loss": 0.3508889675140381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49180856347084045, + "epoch": 6.01, + "learning_rate": 1.9966187658495353e-05, + "loss": 0.4535, + "step": 7106, + "task_loss": 0.31365376710891724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7591346502304077, + "epoch": 6.01, + "learning_rate": 1.996196111580727e-05, + "loss": 0.5221, + "step": 7107, + "task_loss": 0.6466817259788513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37281787395477295, + "epoch": 6.01, + "learning_rate": 1.995773457311919e-05, + "loss": 0.4229, + "step": 7108, + "task_loss": 0.39326924085617065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5049030780792236, + "epoch": 6.01, + "learning_rate": 1.995350803043111e-05, + "loss": 0.4458, + "step": 7109, + "task_loss": 1.6908066272735596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4952152669429779, + "epoch": 6.01, + "learning_rate": 1.994928148774303e-05, + "loss": 0.4272, + "step": 7110, + "task_loss": 0.126212015748024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6992781162261963, + "epoch": 6.01, + "learning_rate": 1.9945054945054948e-05, + "loss": 0.6332, + "step": 7111, + "task_loss": 0.47483015060424805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31548428535461426, + "epoch": 6.01, + "learning_rate": 1.9940828402366864e-05, + "loss": 0.3622, + "step": 7112, + "task_loss": 0.16030895709991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.785224437713623, + "epoch": 6.01, + "learning_rate": 1.9936601859678784e-05, + "loss": 0.5267, + "step": 7113, + "task_loss": 0.9471098780632019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3346899747848511, + "epoch": 6.01, + "learning_rate": 1.9932375316990704e-05, + "loss": 0.4288, + "step": 7114, + "task_loss": 0.4941166341304779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1805504709482193, + "epoch": 6.01, + "learning_rate": 1.992814877430262e-05, + "loss": 0.3375, + "step": 7115, + "task_loss": 0.04331701248884201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.506712019443512, + "epoch": 6.02, + "learning_rate": 1.992392223161454e-05, + "loss": 0.4503, + "step": 7116, + "task_loss": 0.24130764603614807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3008997440338135, + "epoch": 6.02, + "learning_rate": 1.991969568892646e-05, + "loss": 0.3175, + "step": 7117, + "task_loss": 0.533882200717926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28145119547843933, + "epoch": 6.02, + "learning_rate": 1.9915469146238376e-05, + "loss": 0.4303, + "step": 7118, + "task_loss": 0.15442383289337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6824292540550232, + "epoch": 6.02, + "learning_rate": 1.99112426035503e-05, + "loss": 0.4483, + "step": 7119, + "task_loss": 0.5613414645195007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2743123769760132, + "epoch": 6.02, + "learning_rate": 1.9907016060862216e-05, + "loss": 0.3646, + "step": 7120, + "task_loss": 0.6803524494171143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4359835386276245, + "epoch": 6.02, + "learning_rate": 1.9902789518174135e-05, + "loss": 0.366, + "step": 7121, + "task_loss": 0.5511413216590881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5660382509231567, + "epoch": 6.02, + "learning_rate": 1.9898562975486055e-05, + "loss": 0.3876, + "step": 7122, + "task_loss": 0.9108102917671204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33020663261413574, + "epoch": 6.02, + "learning_rate": 1.989433643279797e-05, + "loss": 0.4916, + "step": 7123, + "task_loss": 0.6552597284317017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4297211766242981, + "epoch": 6.02, + "learning_rate": 1.989010989010989e-05, + "loss": 0.5881, + "step": 7124, + "task_loss": 0.6215572953224182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35129514336586, + "epoch": 6.02, + "learning_rate": 1.988588334742181e-05, + "loss": 0.3899, + "step": 7125, + "task_loss": 0.6472136974334717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.402021586894989, + "epoch": 6.02, + "learning_rate": 1.9881656804733727e-05, + "loss": 0.5175, + "step": 7126, + "task_loss": 0.9622039794921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3925040662288666, + "epoch": 6.02, + "learning_rate": 1.987743026204565e-05, + "loss": 0.3536, + "step": 7127, + "task_loss": 0.4665308892726898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5384204387664795, + "epoch": 6.03, + "learning_rate": 1.9873203719357567e-05, + "loss": 0.4926, + "step": 7128, + "task_loss": 0.5490915775299072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43364787101745605, + "epoch": 6.03, + "learning_rate": 1.9868977176669483e-05, + "loss": 0.6371, + "step": 7129, + "task_loss": 0.8415454030036926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5074772834777832, + "epoch": 6.03, + "learning_rate": 1.9864750633981406e-05, + "loss": 0.5866, + "step": 7130, + "task_loss": 0.6001825928688049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38212740421295166, + "epoch": 6.03, + "learning_rate": 1.9860524091293323e-05, + "loss": 0.632, + "step": 7131, + "task_loss": 0.3304152190685272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4040065109729767, + "epoch": 6.03, + "learning_rate": 1.985629754860524e-05, + "loss": 0.3547, + "step": 7132, + "task_loss": 0.477699339389801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20576882362365723, + "epoch": 6.03, + "learning_rate": 1.9852071005917162e-05, + "loss": 0.5589, + "step": 7133, + "task_loss": 0.5750070214271545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42077529430389404, + "epoch": 6.03, + "learning_rate": 1.984784446322908e-05, + "loss": 0.3648, + "step": 7134, + "task_loss": 0.692491888999939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4043070077896118, + "epoch": 6.03, + "learning_rate": 1.9843617920540998e-05, + "loss": 0.3581, + "step": 7135, + "task_loss": 0.6107720136642456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4472653567790985, + "epoch": 6.03, + "learning_rate": 1.9839391377852918e-05, + "loss": 0.3986, + "step": 7136, + "task_loss": 0.48385530710220337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4865350127220154, + "epoch": 6.03, + "learning_rate": 1.9835164835164834e-05, + "loss": 0.4606, + "step": 7137, + "task_loss": 0.9772742986679077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.394976407289505, + "epoch": 6.03, + "learning_rate": 1.9830938292476757e-05, + "loss": 0.4254, + "step": 7138, + "task_loss": 0.48060402274131775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26181313395500183, + "epoch": 6.03, + "learning_rate": 1.9826711749788674e-05, + "loss": 0.2869, + "step": 7139, + "task_loss": 0.4717243015766144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3027476668357849, + "epoch": 6.04, + "learning_rate": 1.9822485207100593e-05, + "loss": 0.5421, + "step": 7140, + "task_loss": 0.33248940110206604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4299558997154236, + "epoch": 6.04, + "learning_rate": 1.9818258664412513e-05, + "loss": 0.4926, + "step": 7141, + "task_loss": 0.3326123058795929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5699581503868103, + "epoch": 6.04, + "learning_rate": 1.981403212172443e-05, + "loss": 0.4683, + "step": 7142, + "task_loss": 0.4445810317993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6149921417236328, + "epoch": 6.04, + "learning_rate": 1.980980557903635e-05, + "loss": 0.4229, + "step": 7143, + "task_loss": 0.4773853123188019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34407907724380493, + "epoch": 6.04, + "learning_rate": 1.980557903634827e-05, + "loss": 0.3924, + "step": 7144, + "task_loss": 0.4509623944759369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6502318978309631, + "epoch": 6.04, + "learning_rate": 1.9801352493660185e-05, + "loss": 0.5104, + "step": 7145, + "task_loss": 1.2366260290145874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41682109236717224, + "epoch": 6.04, + "learning_rate": 1.9797125950972105e-05, + "loss": 0.5311, + "step": 7146, + "task_loss": 0.7743030786514282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37789708375930786, + "epoch": 6.04, + "learning_rate": 1.9792899408284025e-05, + "loss": 0.3977, + "step": 7147, + "task_loss": 0.9515600800514221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2106633186340332, + "epoch": 6.04, + "learning_rate": 1.9788672865595945e-05, + "loss": 0.3723, + "step": 7148, + "task_loss": 0.24695168435573578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4226333498954773, + "epoch": 6.04, + "learning_rate": 1.978444632290786e-05, + "loss": 0.4265, + "step": 7149, + "task_loss": 0.8468390107154846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41120079159736633, + "epoch": 6.04, + "learning_rate": 1.978021978021978e-05, + "loss": 0.4926, + "step": 7150, + "task_loss": 0.6580773591995239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2757922112941742, + "epoch": 6.04, + "learning_rate": 1.97759932375317e-05, + "loss": 0.3017, + "step": 7151, + "task_loss": 0.3530746102333069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5025150775909424, + "epoch": 6.05, + "learning_rate": 1.977176669484362e-05, + "loss": 0.4684, + "step": 7152, + "task_loss": 0.8743808269500732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.322287917137146, + "epoch": 6.05, + "learning_rate": 1.9767540152155536e-05, + "loss": 0.413, + "step": 7153, + "task_loss": 0.08523182570934296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28315162658691406, + "epoch": 6.05, + "learning_rate": 1.9763313609467456e-05, + "loss": 0.4788, + "step": 7154, + "task_loss": 0.17469990253448486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35658466815948486, + "epoch": 6.05, + "learning_rate": 1.9759087066779376e-05, + "loss": 0.4789, + "step": 7155, + "task_loss": 0.09968946129083633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5890517234802246, + "epoch": 6.05, + "learning_rate": 1.9754860524091296e-05, + "loss": 0.5579, + "step": 7156, + "task_loss": 0.9114635586738586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25344109535217285, + "epoch": 6.05, + "learning_rate": 1.9750633981403212e-05, + "loss": 0.4819, + "step": 7157, + "task_loss": 0.359023779630661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2757277190685272, + "epoch": 6.05, + "learning_rate": 1.9746407438715132e-05, + "loss": 0.3359, + "step": 7158, + "task_loss": 0.3692559599876404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49395227432250977, + "epoch": 6.05, + "learning_rate": 1.974218089602705e-05, + "loss": 0.3932, + "step": 7159, + "task_loss": 0.65919029712677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2909431755542755, + "epoch": 6.05, + "learning_rate": 1.9737954353338968e-05, + "loss": 0.5143, + "step": 7160, + "task_loss": 0.20295463502407074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39850038290023804, + "epoch": 6.05, + "learning_rate": 1.9733727810650888e-05, + "loss": 0.4272, + "step": 7161, + "task_loss": 0.7382453680038452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4145810604095459, + "epoch": 6.05, + "learning_rate": 1.9729501267962807e-05, + "loss": 0.4424, + "step": 7162, + "task_loss": 0.27712002396583557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3964119553565979, + "epoch": 6.05, + "learning_rate": 1.9725274725274727e-05, + "loss": 0.3528, + "step": 7163, + "task_loss": 0.3206097185611725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33977770805358887, + "epoch": 6.06, + "learning_rate": 1.9721048182586647e-05, + "loss": 0.4619, + "step": 7164, + "task_loss": 0.6170482039451599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6539547443389893, + "epoch": 6.06, + "learning_rate": 1.9716821639898563e-05, + "loss": 0.588, + "step": 7165, + "task_loss": 0.7581604719161987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43939924240112305, + "epoch": 6.06, + "learning_rate": 1.9712595097210483e-05, + "loss": 0.3468, + "step": 7166, + "task_loss": 0.10226503759622574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2961534559726715, + "epoch": 6.06, + "learning_rate": 1.9708368554522403e-05, + "loss": 0.5043, + "step": 7167, + "task_loss": 0.1020750179886818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36550116539001465, + "epoch": 6.06, + "learning_rate": 1.970414201183432e-05, + "loss": 0.386, + "step": 7168, + "task_loss": 0.14622335135936737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3229232430458069, + "epoch": 6.06, + "learning_rate": 1.9699915469146242e-05, + "loss": 0.4696, + "step": 7169, + "task_loss": 0.4402432441711426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3905911445617676, + "epoch": 6.06, + "learning_rate": 1.969568892645816e-05, + "loss": 0.4543, + "step": 7170, + "task_loss": 0.383543461561203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4533047676086426, + "epoch": 6.06, + "learning_rate": 1.9691462383770075e-05, + "loss": 0.4283, + "step": 7171, + "task_loss": 0.8107605576515198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3723984360694885, + "epoch": 6.06, + "learning_rate": 1.9687235841081998e-05, + "loss": 0.4786, + "step": 7172, + "task_loss": 0.16828909516334534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36962050199508667, + "epoch": 6.06, + "learning_rate": 1.9683009298393914e-05, + "loss": 0.6587, + "step": 7173, + "task_loss": 1.2329379320144653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29564645886421204, + "epoch": 6.06, + "learning_rate": 1.9678782755705834e-05, + "loss": 0.5681, + "step": 7174, + "task_loss": 0.7224622368812561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.538479208946228, + "epoch": 6.07, + "learning_rate": 1.9674556213017754e-05, + "loss": 0.4559, + "step": 7175, + "task_loss": 0.6971434950828552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5118979215621948, + "epoch": 6.07, + "learning_rate": 1.967032967032967e-05, + "loss": 0.4474, + "step": 7176, + "task_loss": 0.14605680108070374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.322388231754303, + "epoch": 6.07, + "learning_rate": 1.966610312764159e-05, + "loss": 0.4133, + "step": 7177, + "task_loss": 0.06437411904335022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5000052452087402, + "epoch": 6.07, + "learning_rate": 1.966187658495351e-05, + "loss": 0.4551, + "step": 7178, + "task_loss": 0.3907923698425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3304354250431061, + "epoch": 6.07, + "learning_rate": 1.9657650042265426e-05, + "loss": 0.3622, + "step": 7179, + "task_loss": 0.8238796591758728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5665068030357361, + "epoch": 6.07, + "learning_rate": 1.965342349957735e-05, + "loss": 0.5243, + "step": 7180, + "task_loss": 1.3886237144470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.416795939207077, + "epoch": 6.07, + "learning_rate": 1.9649196956889265e-05, + "loss": 0.4367, + "step": 7181, + "task_loss": 0.9146501421928406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5946551561355591, + "epoch": 6.07, + "learning_rate": 1.9644970414201182e-05, + "loss": 0.4145, + "step": 7182, + "task_loss": 0.7932815551757812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24415510892868042, + "epoch": 6.07, + "learning_rate": 1.9640743871513105e-05, + "loss": 0.3678, + "step": 7183, + "task_loss": 0.05898415297269821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.664276123046875, + "epoch": 6.07, + "learning_rate": 1.963651732882502e-05, + "loss": 0.6169, + "step": 7184, + "task_loss": 0.16563449800014496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4112812280654907, + "epoch": 6.07, + "learning_rate": 1.963229078613694e-05, + "loss": 0.4604, + "step": 7185, + "task_loss": 0.7957295179367065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3325442969799042, + "epoch": 6.07, + "learning_rate": 1.962806424344886e-05, + "loss": 0.3323, + "step": 7186, + "task_loss": 0.45132017135620117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3425899147987366, + "epoch": 6.08, + "learning_rate": 1.9623837700760777e-05, + "loss": 0.4792, + "step": 7187, + "task_loss": 0.47369828820228577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5338979363441467, + "epoch": 6.08, + "learning_rate": 1.9619611158072697e-05, + "loss": 0.4862, + "step": 7188, + "task_loss": 1.300980806350708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37640833854675293, + "epoch": 6.08, + "learning_rate": 1.9615384615384617e-05, + "loss": 0.4266, + "step": 7189, + "task_loss": 0.8976707458496094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6532567739486694, + "epoch": 6.08, + "learning_rate": 1.9611158072696533e-05, + "loss": 0.4854, + "step": 7190, + "task_loss": 1.5067567825317383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3923265337944031, + "epoch": 6.08, + "learning_rate": 1.9606931530008456e-05, + "loss": 0.4297, + "step": 7191, + "task_loss": 0.5677223205566406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26764655113220215, + "epoch": 6.08, + "learning_rate": 1.9602704987320372e-05, + "loss": 0.3783, + "step": 7192, + "task_loss": 0.49393540620803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6408248543739319, + "epoch": 6.08, + "learning_rate": 1.9598478444632292e-05, + "loss": 0.4839, + "step": 7193, + "task_loss": 1.2655560970306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3643551170825958, + "epoch": 6.08, + "learning_rate": 1.9594251901944212e-05, + "loss": 0.4415, + "step": 7194, + "task_loss": 0.31174948811531067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4293809235095978, + "epoch": 6.08, + "learning_rate": 1.9590025359256128e-05, + "loss": 0.3371, + "step": 7195, + "task_loss": 0.3325020372867584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4124053120613098, + "epoch": 6.08, + "learning_rate": 1.9585798816568048e-05, + "loss": 0.41, + "step": 7196, + "task_loss": 1.1366801261901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5854707956314087, + "epoch": 6.08, + "learning_rate": 1.9581572273879968e-05, + "loss": 0.518, + "step": 7197, + "task_loss": 0.6232240200042725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37119153141975403, + "epoch": 6.08, + "learning_rate": 1.9577345731191887e-05, + "loss": 0.3482, + "step": 7198, + "task_loss": 0.27896738052368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3135519325733185, + "epoch": 6.09, + "learning_rate": 1.9573119188503804e-05, + "loss": 0.3737, + "step": 7199, + "task_loss": 0.7761308550834656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.269506573677063, + "epoch": 6.09, + "learning_rate": 1.9568892645815723e-05, + "loss": 0.4045, + "step": 7200, + "task_loss": 0.5865101218223572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4285007119178772, + "epoch": 6.09, + "learning_rate": 1.9564666103127643e-05, + "loss": 0.5723, + "step": 7201, + "task_loss": 0.15124337375164032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3308812081813812, + "epoch": 6.09, + "learning_rate": 1.956043956043956e-05, + "loss": 0.3671, + "step": 7202, + "task_loss": 0.13573087751865387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2836777865886688, + "epoch": 6.09, + "learning_rate": 1.955621301775148e-05, + "loss": 0.4015, + "step": 7203, + "task_loss": 0.7070236206054688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25491344928741455, + "epoch": 6.09, + "learning_rate": 1.95519864750634e-05, + "loss": 0.4813, + "step": 7204, + "task_loss": 0.8665040135383606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4460285007953644, + "epoch": 6.09, + "learning_rate": 1.954775993237532e-05, + "loss": 0.5672, + "step": 7205, + "task_loss": 0.36557960510253906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5708189010620117, + "epoch": 6.09, + "learning_rate": 1.954353338968724e-05, + "loss": 0.5532, + "step": 7206, + "task_loss": 1.1132572889328003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43596431612968445, + "epoch": 6.09, + "learning_rate": 1.9539306846999155e-05, + "loss": 0.4421, + "step": 7207, + "task_loss": 0.32812559604644775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2519352436065674, + "epoch": 6.09, + "learning_rate": 1.9535080304311075e-05, + "loss": 0.4791, + "step": 7208, + "task_loss": 0.7005215287208557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3390798568725586, + "epoch": 6.09, + "learning_rate": 1.9530853761622994e-05, + "loss": 0.3988, + "step": 7209, + "task_loss": 0.22908218204975128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3662128448486328, + "epoch": 6.09, + "learning_rate": 1.952662721893491e-05, + "loss": 0.3944, + "step": 7210, + "task_loss": 0.5346511006355286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5354434251785278, + "epoch": 6.1, + "learning_rate": 1.952240067624683e-05, + "loss": 0.5107, + "step": 7211, + "task_loss": 1.00706148147583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47801145911216736, + "epoch": 6.1, + "learning_rate": 1.951817413355875e-05, + "loss": 0.472, + "step": 7212, + "task_loss": 0.16307665407657623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3968490958213806, + "epoch": 6.1, + "learning_rate": 1.9513947590870667e-05, + "loss": 0.4674, + "step": 7213, + "task_loss": 0.4874959886074066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4203256368637085, + "epoch": 6.1, + "learning_rate": 1.950972104818259e-05, + "loss": 0.4924, + "step": 7214, + "task_loss": 0.3265974223613739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38115596771240234, + "epoch": 6.1, + "learning_rate": 1.9505494505494506e-05, + "loss": 0.4146, + "step": 7215, + "task_loss": 0.4111417531967163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45491865277290344, + "epoch": 6.1, + "learning_rate": 1.9501267962806426e-05, + "loss": 0.4446, + "step": 7216, + "task_loss": 0.6150575876235962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7031038999557495, + "epoch": 6.1, + "learning_rate": 1.9497041420118345e-05, + "loss": 0.5642, + "step": 7217, + "task_loss": 1.0282368659973145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7475574612617493, + "epoch": 6.1, + "learning_rate": 1.9492814877430262e-05, + "loss": 0.4913, + "step": 7218, + "task_loss": 0.6680750846862793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3313060998916626, + "epoch": 6.1, + "learning_rate": 1.948858833474218e-05, + "loss": 0.3971, + "step": 7219, + "task_loss": 1.079016923904419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4855874478816986, + "epoch": 6.1, + "learning_rate": 1.94843617920541e-05, + "loss": 0.4845, + "step": 7220, + "task_loss": 1.01728355884552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38026174902915955, + "epoch": 6.1, + "learning_rate": 1.9480135249366018e-05, + "loss": 0.4577, + "step": 7221, + "task_loss": 0.5155710577964783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3255561888217926, + "epoch": 6.1, + "learning_rate": 1.947590870667794e-05, + "loss": 0.4267, + "step": 7222, + "task_loss": 0.11263782531023026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4074364900588989, + "epoch": 6.11, + "learning_rate": 1.9471682163989857e-05, + "loss": 0.4735, + "step": 7223, + "task_loss": 0.5037665963172913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3519672453403473, + "epoch": 6.11, + "learning_rate": 1.9467455621301774e-05, + "loss": 0.5246, + "step": 7224, + "task_loss": 1.0137887001037598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4308398365974426, + "epoch": 6.11, + "learning_rate": 1.9463229078613697e-05, + "loss": 0.3713, + "step": 7225, + "task_loss": 0.8662633895874023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4749898314476013, + "epoch": 6.11, + "learning_rate": 1.9459002535925613e-05, + "loss": 0.5471, + "step": 7226, + "task_loss": 0.18816721439361572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5018928647041321, + "epoch": 6.11, + "learning_rate": 1.9454775993237533e-05, + "loss": 0.4716, + "step": 7227, + "task_loss": 0.34584859013557434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48448842763900757, + "epoch": 6.11, + "learning_rate": 1.9450549450549452e-05, + "loss": 0.4319, + "step": 7228, + "task_loss": 0.7106772065162659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.600232720375061, + "epoch": 6.11, + "learning_rate": 1.944632290786137e-05, + "loss": 0.4593, + "step": 7229, + "task_loss": 0.5822135210037231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33292192220687866, + "epoch": 6.11, + "learning_rate": 1.944209636517329e-05, + "loss": 0.3745, + "step": 7230, + "task_loss": 0.2658970355987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3390076756477356, + "epoch": 6.11, + "learning_rate": 1.9437869822485208e-05, + "loss": 0.46, + "step": 7231, + "task_loss": 0.622743546962738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39441582560539246, + "epoch": 6.11, + "learning_rate": 1.9433643279797125e-05, + "loss": 0.3995, + "step": 7232, + "task_loss": 0.19118547439575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36146610975265503, + "epoch": 6.11, + "learning_rate": 1.9429416737109048e-05, + "loss": 0.3749, + "step": 7233, + "task_loss": 0.6277105212211609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41977620124816895, + "epoch": 6.11, + "learning_rate": 1.9425190194420964e-05, + "loss": 0.4191, + "step": 7234, + "task_loss": 0.5643138289451599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6952922344207764, + "epoch": 6.12, + "learning_rate": 1.9420963651732884e-05, + "loss": 0.5151, + "step": 7235, + "task_loss": 0.4672245681285858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38035303354263306, + "epoch": 6.12, + "learning_rate": 1.9416737109044804e-05, + "loss": 0.4027, + "step": 7236, + "task_loss": 0.6459356546401978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7246323823928833, + "epoch": 6.12, + "learning_rate": 1.941251056635672e-05, + "loss": 0.5131, + "step": 7237, + "task_loss": 1.1017228364944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2764829099178314, + "epoch": 6.12, + "learning_rate": 1.940828402366864e-05, + "loss": 0.3546, + "step": 7238, + "task_loss": 0.1499372273683548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45127758383750916, + "epoch": 6.12, + "learning_rate": 1.940405748098056e-05, + "loss": 0.4293, + "step": 7239, + "task_loss": 1.313875436782837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6153087019920349, + "epoch": 6.12, + "learning_rate": 1.9399830938292476e-05, + "loss": 0.5704, + "step": 7240, + "task_loss": 0.29242271184921265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43569034337997437, + "epoch": 6.12, + "learning_rate": 1.9395604395604396e-05, + "loss": 0.3923, + "step": 7241, + "task_loss": 0.5791569948196411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3548681437969208, + "epoch": 6.12, + "learning_rate": 1.9391377852916315e-05, + "loss": 0.4402, + "step": 7242, + "task_loss": 0.6347848176956177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2748337686061859, + "epoch": 6.12, + "learning_rate": 1.9387151310228235e-05, + "loss": 0.3238, + "step": 7243, + "task_loss": 0.5896656513214111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5457012057304382, + "epoch": 6.12, + "learning_rate": 1.9382924767540155e-05, + "loss": 0.4938, + "step": 7244, + "task_loss": 0.5273101329803467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.647644579410553, + "epoch": 6.12, + "learning_rate": 1.937869822485207e-05, + "loss": 0.5782, + "step": 7245, + "task_loss": 0.6769357323646545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30943426489830017, + "epoch": 6.13, + "learning_rate": 1.937447168216399e-05, + "loss": 0.4299, + "step": 7246, + "task_loss": 0.7686152458190918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43437010049819946, + "epoch": 6.13, + "learning_rate": 1.937024513947591e-05, + "loss": 0.4078, + "step": 7247, + "task_loss": 1.0504783391952515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5804103016853333, + "epoch": 6.13, + "learning_rate": 1.9366018596787827e-05, + "loss": 0.4329, + "step": 7248, + "task_loss": 0.7005054354667664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5804166793823242, + "epoch": 6.13, + "learning_rate": 1.9361792054099747e-05, + "loss": 0.3514, + "step": 7249, + "task_loss": 0.6372331380844116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4440772235393524, + "epoch": 6.13, + "learning_rate": 1.9357565511411666e-05, + "loss": 0.4097, + "step": 7250, + "task_loss": 0.7166685461997986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44156256318092346, + "epoch": 6.13, + "learning_rate": 1.9353338968723586e-05, + "loss": 0.3918, + "step": 7251, + "task_loss": 0.9399154186248779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5595995187759399, + "epoch": 6.13, + "learning_rate": 1.9349112426035502e-05, + "loss": 0.5349, + "step": 7252, + "task_loss": 0.22627943754196167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3984464406967163, + "epoch": 6.13, + "learning_rate": 1.9344885883347422e-05, + "loss": 0.4581, + "step": 7253, + "task_loss": 0.2679952085018158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3562823235988617, + "epoch": 6.13, + "learning_rate": 1.9340659340659342e-05, + "loss": 0.5453, + "step": 7254, + "task_loss": 0.6950831413269043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2506830096244812, + "epoch": 6.13, + "learning_rate": 1.933643279797126e-05, + "loss": 0.3891, + "step": 7255, + "task_loss": 0.5149086713790894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39791470766067505, + "epoch": 6.13, + "learning_rate": 1.933220625528318e-05, + "loss": 0.4162, + "step": 7256, + "task_loss": 0.4197749197483063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5846793055534363, + "epoch": 6.13, + "learning_rate": 1.9327979712595098e-05, + "loss": 0.381, + "step": 7257, + "task_loss": 0.8309436440467834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.458749920129776, + "epoch": 6.14, + "learning_rate": 1.9323753169907018e-05, + "loss": 0.6184, + "step": 7258, + "task_loss": 0.9189620614051819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24357575178146362, + "epoch": 6.14, + "learning_rate": 1.9319526627218937e-05, + "loss": 0.3191, + "step": 7259, + "task_loss": 0.3334467113018036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4442724883556366, + "epoch": 6.14, + "learning_rate": 1.9315300084530854e-05, + "loss": 0.4614, + "step": 7260, + "task_loss": 0.6521475315093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42344123125076294, + "epoch": 6.14, + "learning_rate": 1.9311073541842773e-05, + "loss": 0.361, + "step": 7261, + "task_loss": 0.05495256185531616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24567323923110962, + "epoch": 6.14, + "learning_rate": 1.9306846999154693e-05, + "loss": 0.3256, + "step": 7262, + "task_loss": 0.022114817053079605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2565224766731262, + "epoch": 6.14, + "learning_rate": 1.930262045646661e-05, + "loss": 0.5277, + "step": 7263, + "task_loss": 0.6957934498786926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35377541184425354, + "epoch": 6.14, + "learning_rate": 1.9298393913778533e-05, + "loss": 0.4873, + "step": 7264, + "task_loss": 2.5892395973205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4806590974330902, + "epoch": 6.14, + "learning_rate": 1.929416737109045e-05, + "loss": 0.4966, + "step": 7265, + "task_loss": 0.9655662775039673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6087069511413574, + "epoch": 6.14, + "learning_rate": 1.9289940828402365e-05, + "loss": 0.6755, + "step": 7266, + "task_loss": 0.4365273714065552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39462003111839294, + "epoch": 6.14, + "learning_rate": 1.928571428571429e-05, + "loss": 0.3276, + "step": 7267, + "task_loss": 0.9629340767860413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36402931809425354, + "epoch": 6.14, + "learning_rate": 1.9281487743026205e-05, + "loss": 0.4256, + "step": 7268, + "task_loss": 0.797833263874054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31495988368988037, + "epoch": 6.14, + "learning_rate": 1.9277261200338124e-05, + "loss": 0.4138, + "step": 7269, + "task_loss": 0.2734087407588959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9992058873176575, + "epoch": 6.15, + "learning_rate": 1.9273034657650044e-05, + "loss": 0.5954, + "step": 7270, + "task_loss": 0.3763977885246277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6159965991973877, + "epoch": 6.15, + "learning_rate": 1.926880811496196e-05, + "loss": 0.5735, + "step": 7271, + "task_loss": 0.38052043318748474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3040849566459656, + "epoch": 6.15, + "learning_rate": 1.9264581572273884e-05, + "loss": 0.3337, + "step": 7272, + "task_loss": 0.2077777087688446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7303988337516785, + "epoch": 6.15, + "learning_rate": 1.92603550295858e-05, + "loss": 0.4382, + "step": 7273, + "task_loss": 0.4692193865776062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3083440661430359, + "epoch": 6.15, + "learning_rate": 1.9256128486897716e-05, + "loss": 0.391, + "step": 7274, + "task_loss": 0.7587782144546509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44039517641067505, + "epoch": 6.15, + "learning_rate": 1.925190194420964e-05, + "loss": 0.5151, + "step": 7275, + "task_loss": 0.9947290420532227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7071822285652161, + "epoch": 6.15, + "learning_rate": 1.9247675401521556e-05, + "loss": 0.6124, + "step": 7276, + "task_loss": 1.531227707862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3843895196914673, + "epoch": 6.15, + "learning_rate": 1.9243448858833472e-05, + "loss": 0.4671, + "step": 7277, + "task_loss": 0.23336240649223328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5710169076919556, + "epoch": 6.15, + "learning_rate": 1.9239222316145395e-05, + "loss": 0.4727, + "step": 7278, + "task_loss": 0.10760960727930069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31109243631362915, + "epoch": 6.15, + "learning_rate": 1.923499577345731e-05, + "loss": 0.4631, + "step": 7279, + "task_loss": 0.43082380294799805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36493390798568726, + "epoch": 6.15, + "learning_rate": 1.923076923076923e-05, + "loss": 0.3367, + "step": 7280, + "task_loss": 0.49687132239341736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41362395882606506, + "epoch": 6.15, + "learning_rate": 1.922654268808115e-05, + "loss": 0.4922, + "step": 7281, + "task_loss": 0.748630166053772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47343310713768005, + "epoch": 6.16, + "learning_rate": 1.9222316145393068e-05, + "loss": 0.5065, + "step": 7282, + "task_loss": 0.4317403733730316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3707965910434723, + "epoch": 6.16, + "learning_rate": 1.9218089602704987e-05, + "loss": 0.5196, + "step": 7283, + "task_loss": 0.6811929941177368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3478126525878906, + "epoch": 6.16, + "learning_rate": 1.9213863060016907e-05, + "loss": 0.2901, + "step": 7284, + "task_loss": 0.36637598276138306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31879478693008423, + "epoch": 6.16, + "learning_rate": 1.9209636517328827e-05, + "loss": 0.4351, + "step": 7285, + "task_loss": 0.34347429871559143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5941683053970337, + "epoch": 6.16, + "learning_rate": 1.9205409974640746e-05, + "loss": 0.5538, + "step": 7286, + "task_loss": 0.5448051691055298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34017014503479004, + "epoch": 6.16, + "learning_rate": 1.9201183431952663e-05, + "loss": 0.4121, + "step": 7287, + "task_loss": 0.09873310476541519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3984287977218628, + "epoch": 6.16, + "learning_rate": 1.9196956889264583e-05, + "loss": 0.3567, + "step": 7288, + "task_loss": 0.3622848093509674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35995280742645264, + "epoch": 6.16, + "learning_rate": 1.9192730346576502e-05, + "loss": 0.5538, + "step": 7289, + "task_loss": 1.3703365325927734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4665844738483429, + "epoch": 6.16, + "learning_rate": 1.918850380388842e-05, + "loss": 0.3651, + "step": 7290, + "task_loss": 0.4525938332080841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4479590058326721, + "epoch": 6.16, + "learning_rate": 1.918427726120034e-05, + "loss": 0.5299, + "step": 7291, + "task_loss": 0.2775273621082306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29000866413116455, + "epoch": 6.16, + "learning_rate": 1.9180050718512258e-05, + "loss": 0.4321, + "step": 7292, + "task_loss": 0.5615448355674744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6213821172714233, + "epoch": 6.16, + "learning_rate": 1.9175824175824178e-05, + "loss": 0.4756, + "step": 7293, + "task_loss": 0.6850985288619995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3296683132648468, + "epoch": 6.17, + "learning_rate": 1.9171597633136094e-05, + "loss": 0.3898, + "step": 7294, + "task_loss": 0.6386235952377319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30037155747413635, + "epoch": 6.17, + "learning_rate": 1.9167371090448014e-05, + "loss": 0.3686, + "step": 7295, + "task_loss": 0.15430592000484467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5931505560874939, + "epoch": 6.17, + "learning_rate": 1.9163144547759934e-05, + "loss": 0.5338, + "step": 7296, + "task_loss": 0.724937915802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8483796715736389, + "epoch": 6.17, + "learning_rate": 1.9158918005071853e-05, + "loss": 0.5767, + "step": 7297, + "task_loss": 0.4056936502456665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6271671056747437, + "epoch": 6.17, + "learning_rate": 1.915469146238377e-05, + "loss": 0.3816, + "step": 7298, + "task_loss": 0.635051429271698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3000023066997528, + "epoch": 6.17, + "learning_rate": 1.915046491969569e-05, + "loss": 0.3515, + "step": 7299, + "task_loss": 0.8585831522941589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3297983705997467, + "epoch": 6.17, + "learning_rate": 1.914623837700761e-05, + "loss": 0.4136, + "step": 7300, + "task_loss": 0.7414449453353882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8679963946342468, + "epoch": 6.17, + "learning_rate": 1.914201183431953e-05, + "loss": 0.5139, + "step": 7301, + "task_loss": 1.1205123662948608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20349545776844025, + "epoch": 6.17, + "learning_rate": 1.9137785291631445e-05, + "loss": 0.3401, + "step": 7302, + "task_loss": 0.12631426751613617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3698154389858246, + "epoch": 6.17, + "learning_rate": 1.9133558748943365e-05, + "loss": 0.4391, + "step": 7303, + "task_loss": 0.3198661208152771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32010409235954285, + "epoch": 6.17, + "learning_rate": 1.9129332206255285e-05, + "loss": 0.4392, + "step": 7304, + "task_loss": 0.6103374361991882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41391831636428833, + "epoch": 6.17, + "learning_rate": 1.91251056635672e-05, + "loss": 0.566, + "step": 7305, + "task_loss": 1.4587033987045288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2899538278579712, + "epoch": 6.18, + "learning_rate": 1.912087912087912e-05, + "loss": 0.3796, + "step": 7306, + "task_loss": 0.05296706408262253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46851879358291626, + "epoch": 6.18, + "learning_rate": 1.911665257819104e-05, + "loss": 0.4386, + "step": 7307, + "task_loss": 0.16368341445922852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7770696878433228, + "epoch": 6.18, + "learning_rate": 1.911242603550296e-05, + "loss": 0.4877, + "step": 7308, + "task_loss": 1.6495201587677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.466976523399353, + "epoch": 6.18, + "learning_rate": 1.910819949281488e-05, + "loss": 0.496, + "step": 7309, + "task_loss": 0.8720992803573608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25297194719314575, + "epoch": 6.18, + "learning_rate": 1.9103972950126796e-05, + "loss": 0.424, + "step": 7310, + "task_loss": 0.2618497610092163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5290127396583557, + "epoch": 6.18, + "learning_rate": 1.9099746407438716e-05, + "loss": 0.4765, + "step": 7311, + "task_loss": 0.5330207943916321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3235194683074951, + "epoch": 6.18, + "learning_rate": 1.9095519864750636e-05, + "loss": 0.4227, + "step": 7312, + "task_loss": 0.39497196674346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3416614234447479, + "epoch": 6.18, + "learning_rate": 1.9091293322062552e-05, + "loss": 0.4681, + "step": 7313, + "task_loss": 0.3449939489364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3195849061012268, + "epoch": 6.18, + "learning_rate": 1.9087066779374475e-05, + "loss": 0.3835, + "step": 7314, + "task_loss": 0.22382360696792603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31678280234336853, + "epoch": 6.18, + "learning_rate": 1.9082840236686392e-05, + "loss": 0.3901, + "step": 7315, + "task_loss": 0.738714873790741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3156195282936096, + "epoch": 6.18, + "learning_rate": 1.9078613693998308e-05, + "loss": 0.4359, + "step": 7316, + "task_loss": 0.4473409354686737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44515860080718994, + "epoch": 6.19, + "learning_rate": 1.907438715131023e-05, + "loss": 0.5025, + "step": 7317, + "task_loss": 0.3482765257358551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23016749322414398, + "epoch": 6.19, + "learning_rate": 1.9070160608622148e-05, + "loss": 0.3331, + "step": 7318, + "task_loss": 0.5439857244491577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33232271671295166, + "epoch": 6.19, + "learning_rate": 1.9065934065934067e-05, + "loss": 0.3679, + "step": 7319, + "task_loss": 0.6494312882423401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47448423504829407, + "epoch": 6.19, + "learning_rate": 1.9061707523245987e-05, + "loss": 0.4674, + "step": 7320, + "task_loss": 1.3593417406082153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5232985615730286, + "epoch": 6.19, + "learning_rate": 1.9057480980557903e-05, + "loss": 0.5416, + "step": 7321, + "task_loss": 1.7466336488723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43863600492477417, + "epoch": 6.19, + "learning_rate": 1.9053254437869823e-05, + "loss": 0.4628, + "step": 7322, + "task_loss": 0.8731913566589355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3592334985733032, + "epoch": 6.19, + "learning_rate": 1.9049027895181743e-05, + "loss": 0.424, + "step": 7323, + "task_loss": 0.751307487487793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5468201041221619, + "epoch": 6.19, + "learning_rate": 1.904480135249366e-05, + "loss": 0.4954, + "step": 7324, + "task_loss": 0.4092971086502075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45807576179504395, + "epoch": 6.19, + "learning_rate": 1.9040574809805582e-05, + "loss": 0.4143, + "step": 7325, + "task_loss": 0.6595118045806885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.235312819480896, + "epoch": 6.19, + "learning_rate": 1.90363482671175e-05, + "loss": 0.4287, + "step": 7326, + "task_loss": 0.14099450409412384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24788406491279602, + "epoch": 6.19, + "learning_rate": 1.9032121724429415e-05, + "loss": 0.4505, + "step": 7327, + "task_loss": 0.16480973362922668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19721949100494385, + "epoch": 6.19, + "learning_rate": 1.9027895181741338e-05, + "loss": 0.3314, + "step": 7328, + "task_loss": 0.5221731662750244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20585167407989502, + "epoch": 6.2, + "learning_rate": 1.9023668639053255e-05, + "loss": 0.3875, + "step": 7329, + "task_loss": 0.7031477689743042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4062773883342743, + "epoch": 6.2, + "learning_rate": 1.9019442096365174e-05, + "loss": 0.4376, + "step": 7330, + "task_loss": 0.308798223733902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32946330308914185, + "epoch": 6.2, + "learning_rate": 1.9015215553677094e-05, + "loss": 0.4217, + "step": 7331, + "task_loss": 0.3941155970096588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7146567106246948, + "epoch": 6.2, + "learning_rate": 1.901098901098901e-05, + "loss": 0.6376, + "step": 7332, + "task_loss": 0.8359385132789612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2814042866230011, + "epoch": 6.2, + "learning_rate": 1.900676246830093e-05, + "loss": 0.3658, + "step": 7333, + "task_loss": 0.8458582758903503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31023794412612915, + "epoch": 6.2, + "learning_rate": 1.900253592561285e-05, + "loss": 0.3802, + "step": 7334, + "task_loss": 0.7970231175422668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3722670078277588, + "epoch": 6.2, + "learning_rate": 1.8998309382924766e-05, + "loss": 0.3897, + "step": 7335, + "task_loss": 0.8856242895126343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2682872414588928, + "epoch": 6.2, + "learning_rate": 1.899408284023669e-05, + "loss": 0.3356, + "step": 7336, + "task_loss": 0.044519949704408646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3589381277561188, + "epoch": 6.2, + "learning_rate": 1.8989856297548606e-05, + "loss": 0.4341, + "step": 7337, + "task_loss": 0.410133421421051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6460956335067749, + "epoch": 6.2, + "learning_rate": 1.8985629754860525e-05, + "loss": 0.5067, + "step": 7338, + "task_loss": 0.7695217728614807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22184737026691437, + "epoch": 6.2, + "learning_rate": 1.8981403212172445e-05, + "loss": 0.3334, + "step": 7339, + "task_loss": 0.5770419836044312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22460484504699707, + "epoch": 6.2, + "learning_rate": 1.897717666948436e-05, + "loss": 0.4166, + "step": 7340, + "task_loss": 0.26320019364356995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27416718006134033, + "epoch": 6.21, + "learning_rate": 1.897295012679628e-05, + "loss": 0.4181, + "step": 7341, + "task_loss": 0.42398548126220703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37002819776535034, + "epoch": 6.21, + "learning_rate": 1.89687235841082e-05, + "loss": 0.3635, + "step": 7342, + "task_loss": 0.6759760975837708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2989233732223511, + "epoch": 6.21, + "learning_rate": 1.896449704142012e-05, + "loss": 0.3555, + "step": 7343, + "task_loss": 0.39603909850120544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.571068286895752, + "epoch": 6.21, + "learning_rate": 1.8960270498732037e-05, + "loss": 0.4749, + "step": 7344, + "task_loss": 0.3604673445224762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6240682601928711, + "epoch": 6.21, + "learning_rate": 1.8956043956043957e-05, + "loss": 0.4988, + "step": 7345, + "task_loss": 0.8339646458625793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27595847845077515, + "epoch": 6.21, + "learning_rate": 1.8951817413355877e-05, + "loss": 0.4719, + "step": 7346, + "task_loss": 0.2777312397956848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6711755990982056, + "epoch": 6.21, + "learning_rate": 1.8947590870667793e-05, + "loss": 0.4439, + "step": 7347, + "task_loss": 0.9724555015563965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3729579448699951, + "epoch": 6.21, + "learning_rate": 1.8943364327979713e-05, + "loss": 0.429, + "step": 7348, + "task_loss": 0.1850886195898056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.679522693157196, + "epoch": 6.21, + "learning_rate": 1.8939137785291632e-05, + "loss": 0.6091, + "step": 7349, + "task_loss": 1.6236156225204468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20857428014278412, + "epoch": 6.21, + "learning_rate": 1.8934911242603552e-05, + "loss": 0.3602, + "step": 7350, + "task_loss": 0.10442803055047989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3147681951522827, + "epoch": 6.21, + "learning_rate": 1.8930684699915472e-05, + "loss": 0.4358, + "step": 7351, + "task_loss": 0.9983011484146118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.566206693649292, + "epoch": 6.21, + "learning_rate": 1.8926458157227388e-05, + "loss": 0.6135, + "step": 7352, + "task_loss": 0.8196436762809753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6755408644676208, + "epoch": 6.22, + "learning_rate": 1.8922231614539308e-05, + "loss": 0.6164, + "step": 7353, + "task_loss": 1.0270711183547974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7381584644317627, + "epoch": 6.22, + "learning_rate": 1.8918005071851228e-05, + "loss": 0.5472, + "step": 7354, + "task_loss": 1.1247092485427856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6968719959259033, + "epoch": 6.22, + "learning_rate": 1.8913778529163144e-05, + "loss": 0.4274, + "step": 7355, + "task_loss": 0.5901756286621094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.352206826210022, + "epoch": 6.22, + "learning_rate": 1.8909551986475064e-05, + "loss": 0.4541, + "step": 7356, + "task_loss": 0.8579646944999695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42863553762435913, + "epoch": 6.22, + "learning_rate": 1.8905325443786984e-05, + "loss": 0.3989, + "step": 7357, + "task_loss": 0.27568715810775757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5466806888580322, + "epoch": 6.22, + "learning_rate": 1.89010989010989e-05, + "loss": 0.4663, + "step": 7358, + "task_loss": 0.26437240839004517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30731597542762756, + "epoch": 6.22, + "learning_rate": 1.8896872358410823e-05, + "loss": 0.3161, + "step": 7359, + "task_loss": 1.0283024311065674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39132875204086304, + "epoch": 6.22, + "learning_rate": 1.889264581572274e-05, + "loss": 0.3396, + "step": 7360, + "task_loss": 0.43007567524909973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5204179286956787, + "epoch": 6.22, + "learning_rate": 1.888841927303466e-05, + "loss": 0.4599, + "step": 7361, + "task_loss": 0.6762171387672424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40995627641677856, + "epoch": 6.22, + "learning_rate": 1.888419273034658e-05, + "loss": 0.3663, + "step": 7362, + "task_loss": 0.5207911729812622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19905410706996918, + "epoch": 6.22, + "learning_rate": 1.8879966187658495e-05, + "loss": 0.4125, + "step": 7363, + "task_loss": 0.577556848526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7188290357589722, + "epoch": 6.22, + "learning_rate": 1.8875739644970415e-05, + "loss": 0.5165, + "step": 7364, + "task_loss": 0.7827625274658203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5546907186508179, + "epoch": 6.23, + "learning_rate": 1.8871513102282335e-05, + "loss": 0.5043, + "step": 7365, + "task_loss": 0.6529926657676697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14544931054115295, + "epoch": 6.23, + "learning_rate": 1.886728655959425e-05, + "loss": 0.4328, + "step": 7366, + "task_loss": 0.0627792701125145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39158856868743896, + "epoch": 6.23, + "learning_rate": 1.8863060016906174e-05, + "loss": 0.4291, + "step": 7367, + "task_loss": 0.7068882584571838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2912377715110779, + "epoch": 6.23, + "learning_rate": 1.885883347421809e-05, + "loss": 0.3664, + "step": 7368, + "task_loss": 0.29457083344459534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3838551342487335, + "epoch": 6.23, + "learning_rate": 1.8854606931530007e-05, + "loss": 0.4302, + "step": 7369, + "task_loss": 0.605881929397583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5682333707809448, + "epoch": 6.23, + "learning_rate": 1.885038038884193e-05, + "loss": 0.4513, + "step": 7370, + "task_loss": 0.5042519569396973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6683791875839233, + "epoch": 6.23, + "learning_rate": 1.8846153846153846e-05, + "loss": 0.5397, + "step": 7371, + "task_loss": 0.6774916648864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4389325976371765, + "epoch": 6.23, + "learning_rate": 1.8841927303465766e-05, + "loss": 0.4626, + "step": 7372, + "task_loss": 0.27493783831596375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8418166637420654, + "epoch": 6.23, + "learning_rate": 1.8837700760777686e-05, + "loss": 0.4297, + "step": 7373, + "task_loss": 1.1720597743988037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5171328783035278, + "epoch": 6.23, + "learning_rate": 1.8833474218089602e-05, + "loss": 0.4441, + "step": 7374, + "task_loss": 0.5749563574790955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4629538059234619, + "epoch": 6.23, + "learning_rate": 1.8829247675401522e-05, + "loss": 0.3852, + "step": 7375, + "task_loss": 0.6483286619186401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4664277136325836, + "epoch": 6.23, + "learning_rate": 1.882502113271344e-05, + "loss": 0.5763, + "step": 7376, + "task_loss": 0.945827066898346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49366310238838196, + "epoch": 6.24, + "learning_rate": 1.8820794590025358e-05, + "loss": 0.4849, + "step": 7377, + "task_loss": 0.6275193095207214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3395857810974121, + "epoch": 6.24, + "learning_rate": 1.881656804733728e-05, + "loss": 0.5228, + "step": 7378, + "task_loss": 0.7266091704368591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3572564721107483, + "epoch": 6.24, + "learning_rate": 1.8812341504649197e-05, + "loss": 0.5073, + "step": 7379, + "task_loss": 0.5705186724662781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28859540820121765, + "epoch": 6.24, + "learning_rate": 1.8808114961961117e-05, + "loss": 0.3876, + "step": 7380, + "task_loss": 0.5783632397651672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.381360799074173, + "epoch": 6.24, + "learning_rate": 1.8803888419273037e-05, + "loss": 0.4359, + "step": 7381, + "task_loss": 1.1658483743667603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5316799879074097, + "epoch": 6.24, + "learning_rate": 1.8799661876584953e-05, + "loss": 0.5222, + "step": 7382, + "task_loss": 0.44763651490211487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4617660641670227, + "epoch": 6.24, + "learning_rate": 1.8795435333896873e-05, + "loss": 0.4709, + "step": 7383, + "task_loss": 1.060294508934021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23650670051574707, + "epoch": 6.24, + "learning_rate": 1.8791208791208793e-05, + "loss": 0.397, + "step": 7384, + "task_loss": 0.11781829595565796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4250749945640564, + "epoch": 6.24, + "learning_rate": 1.878698224852071e-05, + "loss": 0.3795, + "step": 7385, + "task_loss": 0.5753234028816223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6258448362350464, + "epoch": 6.24, + "learning_rate": 1.878275570583263e-05, + "loss": 0.4174, + "step": 7386, + "task_loss": 1.0588935613632202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.575982928276062, + "epoch": 6.24, + "learning_rate": 1.877852916314455e-05, + "loss": 0.5323, + "step": 7387, + "task_loss": 0.9774614572525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5572336912155151, + "epoch": 6.24, + "learning_rate": 1.877430262045647e-05, + "loss": 0.5596, + "step": 7388, + "task_loss": 0.24310147762298584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35478606820106506, + "epoch": 6.25, + "learning_rate": 1.8770076077768388e-05, + "loss": 0.4239, + "step": 7389, + "task_loss": 0.7384392023086548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3330371379852295, + "epoch": 6.25, + "learning_rate": 1.8765849535080304e-05, + "loss": 0.3789, + "step": 7390, + "task_loss": 0.21040931344032288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6635715365409851, + "epoch": 6.25, + "learning_rate": 1.8761622992392224e-05, + "loss": 0.4656, + "step": 7391, + "task_loss": 0.5885238647460938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.500828206539154, + "epoch": 6.25, + "learning_rate": 1.8757396449704144e-05, + "loss": 0.3904, + "step": 7392, + "task_loss": 0.7408778071403503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5223671197891235, + "epoch": 6.25, + "learning_rate": 1.875316990701606e-05, + "loss": 0.5171, + "step": 7393, + "task_loss": 1.0610510110855103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18094712495803833, + "epoch": 6.25, + "learning_rate": 1.874894336432798e-05, + "loss": 0.3785, + "step": 7394, + "task_loss": 0.6333187818527222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6549341678619385, + "epoch": 6.25, + "learning_rate": 1.87447168216399e-05, + "loss": 0.4936, + "step": 7395, + "task_loss": 0.7159588932991028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2940424680709839, + "epoch": 6.25, + "learning_rate": 1.874049027895182e-05, + "loss": 0.3755, + "step": 7396, + "task_loss": 0.160593181848526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4882940649986267, + "epoch": 6.25, + "learning_rate": 1.8736263736263736e-05, + "loss": 0.4114, + "step": 7397, + "task_loss": 0.9935579895973206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.528218686580658, + "epoch": 6.25, + "learning_rate": 1.8732037193575656e-05, + "loss": 0.4726, + "step": 7398, + "task_loss": 0.09689510613679886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6261005401611328, + "epoch": 6.25, + "learning_rate": 1.8727810650887575e-05, + "loss": 0.4897, + "step": 7399, + "task_loss": 1.3823537826538086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29487594962120056, + "epoch": 6.26, + "learning_rate": 1.8723584108199495e-05, + "loss": 0.3328, + "step": 7400, + "task_loss": 0.7492702603340149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4351596236228943, + "epoch": 6.26, + "learning_rate": 1.871935756551141e-05, + "loss": 0.4276, + "step": 7401, + "task_loss": 0.36651086807250977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5562874674797058, + "epoch": 6.26, + "learning_rate": 1.871513102282333e-05, + "loss": 0.3936, + "step": 7402, + "task_loss": 0.32533732056617737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2341289520263672, + "epoch": 6.26, + "learning_rate": 1.871090448013525e-05, + "loss": 0.3087, + "step": 7403, + "task_loss": 0.3589152693748474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46211016178131104, + "epoch": 6.26, + "learning_rate": 1.870667793744717e-05, + "loss": 0.4089, + "step": 7404, + "task_loss": 0.07596525549888611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31350022554397583, + "epoch": 6.26, + "learning_rate": 1.8702451394759087e-05, + "loss": 0.4579, + "step": 7405, + "task_loss": 0.736756443977356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3533334732055664, + "epoch": 6.26, + "learning_rate": 1.8698224852071007e-05, + "loss": 0.461, + "step": 7406, + "task_loss": 0.7795369625091553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36756011843681335, + "epoch": 6.26, + "learning_rate": 1.8693998309382926e-05, + "loss": 0.3735, + "step": 7407, + "task_loss": 0.20933058857917786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43795180320739746, + "epoch": 6.26, + "learning_rate": 1.8689771766694843e-05, + "loss": 0.5907, + "step": 7408, + "task_loss": 0.47002172470092773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5881123542785645, + "epoch": 6.26, + "learning_rate": 1.8685545224006766e-05, + "loss": 0.4426, + "step": 7409, + "task_loss": 0.9388710260391235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5761983394622803, + "epoch": 6.26, + "learning_rate": 1.8681318681318682e-05, + "loss": 0.5937, + "step": 7410, + "task_loss": 0.7771567106246948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4625416398048401, + "epoch": 6.26, + "learning_rate": 1.86770921386306e-05, + "loss": 0.3953, + "step": 7411, + "task_loss": 0.5007261633872986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25369852781295776, + "epoch": 6.27, + "learning_rate": 1.8672865595942522e-05, + "loss": 0.3196, + "step": 7412, + "task_loss": 0.7372127771377563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39217662811279297, + "epoch": 6.27, + "learning_rate": 1.8668639053254438e-05, + "loss": 0.567, + "step": 7413, + "task_loss": 0.9854080080986023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28860050439834595, + "epoch": 6.27, + "learning_rate": 1.8664412510566358e-05, + "loss": 0.3232, + "step": 7414, + "task_loss": 0.4750038981437683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4465206563472748, + "epoch": 6.27, + "learning_rate": 1.8660185967878278e-05, + "loss": 0.4259, + "step": 7415, + "task_loss": 0.5471699833869934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6128970384597778, + "epoch": 6.27, + "learning_rate": 1.8655959425190194e-05, + "loss": 0.5968, + "step": 7416, + "task_loss": 1.4372280836105347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7934972047805786, + "epoch": 6.27, + "learning_rate": 1.8651732882502117e-05, + "loss": 0.5329, + "step": 7417, + "task_loss": 0.6570456027984619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30409589409828186, + "epoch": 6.27, + "learning_rate": 1.8647506339814033e-05, + "loss": 0.3209, + "step": 7418, + "task_loss": 0.16773465275764465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25856998562812805, + "epoch": 6.27, + "learning_rate": 1.864327979712595e-05, + "loss": 0.3579, + "step": 7419, + "task_loss": 1.1813669204711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6655339002609253, + "epoch": 6.27, + "learning_rate": 1.8639053254437873e-05, + "loss": 0.6131, + "step": 7420, + "task_loss": 1.254402995109558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36151522397994995, + "epoch": 6.27, + "learning_rate": 1.863482671174979e-05, + "loss": 0.4679, + "step": 7421, + "task_loss": 0.874910295009613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4015306830406189, + "epoch": 6.27, + "learning_rate": 1.8630600169061706e-05, + "loss": 0.4121, + "step": 7422, + "task_loss": 0.4424337148666382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4657759368419647, + "epoch": 6.27, + "learning_rate": 1.862637362637363e-05, + "loss": 0.2997, + "step": 7423, + "task_loss": 0.4597998857498169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.263441801071167, + "epoch": 6.28, + "learning_rate": 1.8622147083685545e-05, + "loss": 0.3745, + "step": 7424, + "task_loss": 0.435884565114975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3490613102912903, + "epoch": 6.28, + "learning_rate": 1.8617920540997465e-05, + "loss": 0.4627, + "step": 7425, + "task_loss": 1.2124418020248413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25691765546798706, + "epoch": 6.28, + "learning_rate": 1.8613693998309385e-05, + "loss": 0.3247, + "step": 7426, + "task_loss": 0.3420126736164093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5784562826156616, + "epoch": 6.28, + "learning_rate": 1.86094674556213e-05, + "loss": 0.4764, + "step": 7427, + "task_loss": 0.6915899515151978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3411710262298584, + "epoch": 6.28, + "learning_rate": 1.860524091293322e-05, + "loss": 0.458, + "step": 7428, + "task_loss": 0.3122914731502533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3249645233154297, + "epoch": 6.28, + "learning_rate": 1.860101437024514e-05, + "loss": 0.3601, + "step": 7429, + "task_loss": 0.09959448873996735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22226080298423767, + "epoch": 6.28, + "learning_rate": 1.8596787827557057e-05, + "loss": 0.3633, + "step": 7430, + "task_loss": 0.3487250506877899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6091804504394531, + "epoch": 6.28, + "learning_rate": 1.859256128486898e-05, + "loss": 0.4972, + "step": 7431, + "task_loss": 0.4621986448764801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48916590213775635, + "epoch": 6.28, + "learning_rate": 1.8588334742180896e-05, + "loss": 0.4447, + "step": 7432, + "task_loss": 0.557756781578064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1964636594057083, + "epoch": 6.28, + "learning_rate": 1.8584108199492816e-05, + "loss": 0.3794, + "step": 7433, + "task_loss": 0.3018451929092407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7813000679016113, + "epoch": 6.28, + "learning_rate": 1.8579881656804736e-05, + "loss": 0.5741, + "step": 7434, + "task_loss": 0.18632301688194275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27533307671546936, + "epoch": 6.28, + "learning_rate": 1.8575655114116652e-05, + "loss": 0.4389, + "step": 7435, + "task_loss": 0.09850893169641495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20819804072380066, + "epoch": 6.29, + "learning_rate": 1.8571428571428572e-05, + "loss": 0.3719, + "step": 7436, + "task_loss": 0.022275356575846672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5539222955703735, + "epoch": 6.29, + "learning_rate": 1.856720202874049e-05, + "loss": 0.5492, + "step": 7437, + "task_loss": 0.5539168119430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5495555400848389, + "epoch": 6.29, + "learning_rate": 1.856297548605241e-05, + "loss": 0.5626, + "step": 7438, + "task_loss": 0.12057632207870483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3981839418411255, + "epoch": 6.29, + "learning_rate": 1.8558748943364328e-05, + "loss": 0.4617, + "step": 7439, + "task_loss": 0.33110612630844116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6272201538085938, + "epoch": 6.29, + "learning_rate": 1.8554522400676247e-05, + "loss": 0.4235, + "step": 7440, + "task_loss": 1.143898606300354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37437906861305237, + "epoch": 6.29, + "learning_rate": 1.8550295857988167e-05, + "loss": 0.408, + "step": 7441, + "task_loss": 0.6268939971923828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4869462847709656, + "epoch": 6.29, + "learning_rate": 1.8546069315300087e-05, + "loss": 0.4792, + "step": 7442, + "task_loss": 1.1253035068511963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5036808252334595, + "epoch": 6.29, + "learning_rate": 1.8541842772612003e-05, + "loss": 0.5781, + "step": 7443, + "task_loss": 1.171592116355896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4199615716934204, + "epoch": 6.29, + "learning_rate": 1.8537616229923923e-05, + "loss": 0.4245, + "step": 7444, + "task_loss": 0.049844495952129364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.161430224776268, + "epoch": 6.29, + "learning_rate": 1.8533389687235843e-05, + "loss": 0.4761, + "step": 7445, + "task_loss": 0.11534826457500458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4419282376766205, + "epoch": 6.29, + "learning_rate": 1.8529163144547762e-05, + "loss": 0.368, + "step": 7446, + "task_loss": 0.25295397639274597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43667957186698914, + "epoch": 6.29, + "learning_rate": 1.852493660185968e-05, + "loss": 0.4716, + "step": 7447, + "task_loss": 0.23077793419361115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4362657070159912, + "epoch": 6.3, + "learning_rate": 1.85207100591716e-05, + "loss": 0.4857, + "step": 7448, + "task_loss": 0.7477615475654602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5921239852905273, + "epoch": 6.3, + "learning_rate": 1.8516483516483518e-05, + "loss": 0.4881, + "step": 7449, + "task_loss": 0.7867098450660706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4152717590332031, + "epoch": 6.3, + "learning_rate": 1.8512256973795435e-05, + "loss": 0.3956, + "step": 7450, + "task_loss": 0.6226378083229065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45414963364601135, + "epoch": 6.3, + "learning_rate": 1.8508030431107354e-05, + "loss": 0.4763, + "step": 7451, + "task_loss": 0.8678309917449951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3037633001804352, + "epoch": 6.3, + "learning_rate": 1.8503803888419274e-05, + "loss": 0.4174, + "step": 7452, + "task_loss": 0.057907044887542725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3846437335014343, + "epoch": 6.3, + "learning_rate": 1.8499577345731194e-05, + "loss": 0.3531, + "step": 7453, + "task_loss": 0.5084137916564941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47334036231040955, + "epoch": 6.3, + "learning_rate": 1.8495350803043113e-05, + "loss": 0.3884, + "step": 7454, + "task_loss": 0.6242532730102539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.538460910320282, + "epoch": 6.3, + "learning_rate": 1.849112426035503e-05, + "loss": 0.4844, + "step": 7455, + "task_loss": 0.9625957012176514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3545105457305908, + "epoch": 6.3, + "learning_rate": 1.848689771766695e-05, + "loss": 0.4091, + "step": 7456, + "task_loss": 0.3145792484283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5496225953102112, + "epoch": 6.3, + "learning_rate": 1.848267117497887e-05, + "loss": 0.4465, + "step": 7457, + "task_loss": 0.9730275273323059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4006803631782532, + "epoch": 6.3, + "learning_rate": 1.8478444632290786e-05, + "loss": 0.5432, + "step": 7458, + "task_loss": 0.5437538623809814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3841736912727356, + "epoch": 6.3, + "learning_rate": 1.8474218089602705e-05, + "loss": 0.4372, + "step": 7459, + "task_loss": 0.7839620113372803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4889374077320099, + "epoch": 6.31, + "learning_rate": 1.8469991546914625e-05, + "loss": 0.37, + "step": 7460, + "task_loss": 0.6489946246147156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23489783704280853, + "epoch": 6.31, + "learning_rate": 1.846576500422654e-05, + "loss": 0.3055, + "step": 7461, + "task_loss": 0.5485289096832275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39727866649627686, + "epoch": 6.31, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.3871, + "step": 7462, + "task_loss": 0.44308724999427795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.209807351231575, + "epoch": 6.31, + "learning_rate": 1.845731191885038e-05, + "loss": 0.4402, + "step": 7463, + "task_loss": 0.09618355333805084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2060011327266693, + "epoch": 6.31, + "learning_rate": 1.84530853761623e-05, + "loss": 0.3076, + "step": 7464, + "task_loss": 0.6820746660232544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2552296221256256, + "epoch": 6.31, + "learning_rate": 1.844885883347422e-05, + "loss": 0.3227, + "step": 7465, + "task_loss": 0.45945101976394653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28092581033706665, + "epoch": 6.31, + "learning_rate": 1.8444632290786137e-05, + "loss": 0.4415, + "step": 7466, + "task_loss": 0.6931418180465698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38539350032806396, + "epoch": 6.31, + "learning_rate": 1.8440405748098057e-05, + "loss": 0.4929, + "step": 7467, + "task_loss": 0.7239944934844971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4543333947658539, + "epoch": 6.31, + "learning_rate": 1.8436179205409976e-05, + "loss": 0.3338, + "step": 7468, + "task_loss": 0.6084698438644409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31763705611228943, + "epoch": 6.31, + "learning_rate": 1.8431952662721893e-05, + "loss": 0.4451, + "step": 7469, + "task_loss": 0.12259476631879807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3203537166118622, + "epoch": 6.31, + "learning_rate": 1.8427726120033816e-05, + "loss": 0.47, + "step": 7470, + "task_loss": 0.08740745484828949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40922850370407104, + "epoch": 6.32, + "learning_rate": 1.8423499577345732e-05, + "loss": 0.4579, + "step": 7471, + "task_loss": 0.587205171585083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4450766444206238, + "epoch": 6.32, + "learning_rate": 1.841927303465765e-05, + "loss": 0.3305, + "step": 7472, + "task_loss": 0.3386770188808441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3301679491996765, + "epoch": 6.32, + "learning_rate": 1.841504649196957e-05, + "loss": 0.4391, + "step": 7473, + "task_loss": 0.5538355112075806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.599484384059906, + "epoch": 6.32, + "learning_rate": 1.8410819949281488e-05, + "loss": 0.5186, + "step": 7474, + "task_loss": 0.7517311573028564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5912399291992188, + "epoch": 6.32, + "learning_rate": 1.8406593406593408e-05, + "loss": 0.5197, + "step": 7475, + "task_loss": 0.9786093831062317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6440020799636841, + "epoch": 6.32, + "learning_rate": 1.8402366863905327e-05, + "loss": 0.4693, + "step": 7476, + "task_loss": 0.3304566740989685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3821730613708496, + "epoch": 6.32, + "learning_rate": 1.8398140321217244e-05, + "loss": 0.4206, + "step": 7477, + "task_loss": 0.11487775295972824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28783780336380005, + "epoch": 6.32, + "learning_rate": 1.8393913778529163e-05, + "loss": 0.4087, + "step": 7478, + "task_loss": 0.49114659428596497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5587480068206787, + "epoch": 6.32, + "learning_rate": 1.8389687235841083e-05, + "loss": 0.4221, + "step": 7479, + "task_loss": 0.9028792381286621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5051053166389465, + "epoch": 6.32, + "learning_rate": 1.8385460693153e-05, + "loss": 0.4592, + "step": 7480, + "task_loss": 1.2609680891036987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38354945182800293, + "epoch": 6.32, + "learning_rate": 1.8381234150464923e-05, + "loss": 0.4097, + "step": 7481, + "task_loss": 0.3581293523311615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5226291418075562, + "epoch": 6.32, + "learning_rate": 1.837700760777684e-05, + "loss": 0.5195, + "step": 7482, + "task_loss": 0.7903541326522827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43954920768737793, + "epoch": 6.33, + "learning_rate": 1.837278106508876e-05, + "loss": 0.3811, + "step": 7483, + "task_loss": 1.0014567375183105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39188283681869507, + "epoch": 6.33, + "learning_rate": 1.836855452240068e-05, + "loss": 0.4197, + "step": 7484, + "task_loss": 0.70308518409729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6716598272323608, + "epoch": 6.33, + "learning_rate": 1.8364327979712595e-05, + "loss": 0.3651, + "step": 7485, + "task_loss": 0.5552574396133423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33321717381477356, + "epoch": 6.33, + "learning_rate": 1.8360101437024515e-05, + "loss": 0.3888, + "step": 7486, + "task_loss": 0.42140358686447144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29450124502182007, + "epoch": 6.33, + "learning_rate": 1.8355874894336434e-05, + "loss": 0.4155, + "step": 7487, + "task_loss": 0.6798124313354492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2847437858581543, + "epoch": 6.33, + "learning_rate": 1.835164835164835e-05, + "loss": 0.4167, + "step": 7488, + "task_loss": 0.2796574532985687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27172374725341797, + "epoch": 6.33, + "learning_rate": 1.834742180896027e-05, + "loss": 0.3815, + "step": 7489, + "task_loss": 0.09973116219043732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5415390729904175, + "epoch": 6.33, + "learning_rate": 1.834319526627219e-05, + "loss": 0.4668, + "step": 7490, + "task_loss": 0.14894388616085052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4161244034767151, + "epoch": 6.33, + "learning_rate": 1.833896872358411e-05, + "loss": 0.4886, + "step": 7491, + "task_loss": 0.42361685633659363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1837972104549408, + "epoch": 6.33, + "learning_rate": 1.8334742180896026e-05, + "loss": 0.3756, + "step": 7492, + "task_loss": 0.2381788045167923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42369329929351807, + "epoch": 6.33, + "learning_rate": 1.8330515638207946e-05, + "loss": 0.4023, + "step": 7493, + "task_loss": 0.6435173153877258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4651643633842468, + "epoch": 6.33, + "learning_rate": 1.8326289095519866e-05, + "loss": 0.4336, + "step": 7494, + "task_loss": 0.5296189188957214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2605997920036316, + "epoch": 6.34, + "learning_rate": 1.8322062552831785e-05, + "loss": 0.3674, + "step": 7495, + "task_loss": 0.893089771270752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8749069571495056, + "epoch": 6.34, + "learning_rate": 1.8317836010143705e-05, + "loss": 0.494, + "step": 7496, + "task_loss": 0.9435193538665771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5157427191734314, + "epoch": 6.34, + "learning_rate": 1.831360946745562e-05, + "loss": 0.4746, + "step": 7497, + "task_loss": 0.6275808215141296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28652673959732056, + "epoch": 6.34, + "learning_rate": 1.830938292476754e-05, + "loss": 0.4654, + "step": 7498, + "task_loss": 0.36687055230140686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47160595655441284, + "epoch": 6.34, + "learning_rate": 1.830515638207946e-05, + "loss": 0.4602, + "step": 7499, + "task_loss": 0.590064525604248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5901178121566772, + "epoch": 6.34, + "learning_rate": 1.8300929839391377e-05, + "loss": 0.4143, + "step": 7500, + "task_loss": 1.0041007995605469 + }, + { + "epoch": 6.34, + "eval_accuracy": 0.9095445544554456, + "eval_loss": 0.2958400845527649, + "eval_runtime": 228.4673, + "eval_samples_per_second": 110.519, + "eval_steps_per_second": 0.867, + "step": 7500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45036858320236206, + "epoch": 6.34, + "learning_rate": 1.8296703296703297e-05, + "loss": 0.386, + "step": 7501, + "task_loss": 1.1028269529342651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4629344940185547, + "epoch": 6.34, + "learning_rate": 1.8292476754015217e-05, + "loss": 0.5115, + "step": 7502, + "task_loss": 0.9906623363494873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35838794708251953, + "epoch": 6.34, + "learning_rate": 1.8288250211327133e-05, + "loss": 0.3938, + "step": 7503, + "task_loss": 0.8167701363563538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.216829314827919, + "epoch": 6.34, + "learning_rate": 1.8284023668639056e-05, + "loss": 0.3553, + "step": 7504, + "task_loss": 0.158543199300766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.402401864528656, + "epoch": 6.34, + "learning_rate": 1.8279797125950973e-05, + "loss": 0.3801, + "step": 7505, + "task_loss": 0.20732076466083527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5993769764900208, + "epoch": 6.34, + "learning_rate": 1.8275570583262892e-05, + "loss": 0.5207, + "step": 7506, + "task_loss": 0.7096792459487915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24969404935836792, + "epoch": 6.35, + "learning_rate": 1.8271344040574812e-05, + "loss": 0.3842, + "step": 7507, + "task_loss": 0.10812999308109283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.307307630777359, + "epoch": 6.35, + "learning_rate": 1.826711749788673e-05, + "loss": 0.4051, + "step": 7508, + "task_loss": 1.055645227432251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21905399858951569, + "epoch": 6.35, + "learning_rate": 1.8262890955198648e-05, + "loss": 0.3703, + "step": 7509, + "task_loss": 0.27905556559562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7335200309753418, + "epoch": 6.35, + "learning_rate": 1.8258664412510568e-05, + "loss": 0.5096, + "step": 7510, + "task_loss": 0.6932314038276672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23352962732315063, + "epoch": 6.35, + "learning_rate": 1.8254437869822484e-05, + "loss": 0.3776, + "step": 7511, + "task_loss": 0.8729584813117981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2948322296142578, + "epoch": 6.35, + "learning_rate": 1.8250211327134407e-05, + "loss": 0.3892, + "step": 7512, + "task_loss": 0.18286819756031036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3483908176422119, + "epoch": 6.35, + "learning_rate": 1.8245984784446324e-05, + "loss": 0.335, + "step": 7513, + "task_loss": 0.3157382905483246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.527278482913971, + "epoch": 6.35, + "learning_rate": 1.824175824175824e-05, + "loss": 0.4481, + "step": 7514, + "task_loss": 0.46831732988357544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2302231341600418, + "epoch": 6.35, + "learning_rate": 1.8237531699070163e-05, + "loss": 0.3349, + "step": 7515, + "task_loss": 0.5565590262413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4539869427680969, + "epoch": 6.35, + "learning_rate": 1.823330515638208e-05, + "loss": 0.4749, + "step": 7516, + "task_loss": 0.6743857860565186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3175102174282074, + "epoch": 6.35, + "learning_rate": 1.8229078613694e-05, + "loss": 0.4083, + "step": 7517, + "task_loss": 0.31341618299484253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2807077169418335, + "epoch": 6.35, + "learning_rate": 1.822485207100592e-05, + "loss": 0.3712, + "step": 7518, + "task_loss": 0.28901487588882446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37975335121154785, + "epoch": 6.36, + "learning_rate": 1.8220625528317836e-05, + "loss": 0.3771, + "step": 7519, + "task_loss": 0.09109170734882355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48064905405044556, + "epoch": 6.36, + "learning_rate": 1.8216398985629755e-05, + "loss": 0.6656, + "step": 7520, + "task_loss": 0.7549939155578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48382243514060974, + "epoch": 6.36, + "learning_rate": 1.8212172442941675e-05, + "loss": 0.4908, + "step": 7521, + "task_loss": 0.6553642749786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8987942337989807, + "epoch": 6.36, + "learning_rate": 1.820794590025359e-05, + "loss": 0.5819, + "step": 7522, + "task_loss": 1.512727975845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31782814860343933, + "epoch": 6.36, + "learning_rate": 1.8203719357565514e-05, + "loss": 0.4287, + "step": 7523, + "task_loss": 0.025357339531183243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4128161072731018, + "epoch": 6.36, + "learning_rate": 1.819949281487743e-05, + "loss": 0.4255, + "step": 7524, + "task_loss": 0.6650553941726685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29444167017936707, + "epoch": 6.36, + "learning_rate": 1.819526627218935e-05, + "loss": 0.4643, + "step": 7525, + "task_loss": 0.5308310985565186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5274129509925842, + "epoch": 6.36, + "learning_rate": 1.819103972950127e-05, + "loss": 0.4392, + "step": 7526, + "task_loss": 1.0899982452392578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3549845814704895, + "epoch": 6.36, + "learning_rate": 1.8186813186813187e-05, + "loss": 0.3506, + "step": 7527, + "task_loss": 0.15445595979690552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3621762990951538, + "epoch": 6.36, + "learning_rate": 1.8182586644125106e-05, + "loss": 0.4189, + "step": 7528, + "task_loss": 0.4782595932483673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36561524868011475, + "epoch": 6.36, + "learning_rate": 1.8178360101437026e-05, + "loss": 0.4063, + "step": 7529, + "task_loss": 0.9964373707771301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5830185413360596, + "epoch": 6.36, + "learning_rate": 1.8174133558748942e-05, + "loss": 0.5059, + "step": 7530, + "task_loss": 0.39107224345207214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2264605313539505, + "epoch": 6.37, + "learning_rate": 1.8169907016060862e-05, + "loss": 0.3731, + "step": 7531, + "task_loss": 0.14437159895896912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42552751302719116, + "epoch": 6.37, + "learning_rate": 1.8165680473372782e-05, + "loss": 0.4535, + "step": 7532, + "task_loss": 0.8140941858291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6502187252044678, + "epoch": 6.37, + "learning_rate": 1.81614539306847e-05, + "loss": 0.5448, + "step": 7533, + "task_loss": 0.17116880416870117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33767756819725037, + "epoch": 6.37, + "learning_rate": 1.815722738799662e-05, + "loss": 0.4523, + "step": 7534, + "task_loss": 0.5321668982505798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45479732751846313, + "epoch": 6.37, + "learning_rate": 1.8153000845308538e-05, + "loss": 0.3906, + "step": 7535, + "task_loss": 0.5577519536018372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33437007665634155, + "epoch": 6.37, + "learning_rate": 1.8148774302620458e-05, + "loss": 0.4633, + "step": 7536, + "task_loss": 0.2395503968000412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22952744364738464, + "epoch": 6.37, + "learning_rate": 1.8144547759932377e-05, + "loss": 0.4032, + "step": 7537, + "task_loss": 0.23318517208099365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47617363929748535, + "epoch": 6.37, + "learning_rate": 1.8140321217244294e-05, + "loss": 0.4404, + "step": 7538, + "task_loss": 0.902389645576477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34737879037857056, + "epoch": 6.37, + "learning_rate": 1.8136094674556213e-05, + "loss": 0.492, + "step": 7539, + "task_loss": 0.842831552028656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39803364872932434, + "epoch": 6.37, + "learning_rate": 1.8131868131868133e-05, + "loss": 0.3803, + "step": 7540, + "task_loss": 0.3733649253845215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2669640779495239, + "epoch": 6.37, + "learning_rate": 1.8127641589180053e-05, + "loss": 0.4487, + "step": 7541, + "task_loss": 0.09107401967048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3468136787414551, + "epoch": 6.38, + "learning_rate": 1.812341504649197e-05, + "loss": 0.3951, + "step": 7542, + "task_loss": 0.24380576610565186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35958659648895264, + "epoch": 6.38, + "learning_rate": 1.811918850380389e-05, + "loss": 0.4175, + "step": 7543, + "task_loss": 0.4848724901676178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4767422676086426, + "epoch": 6.38, + "learning_rate": 1.811496196111581e-05, + "loss": 0.4568, + "step": 7544, + "task_loss": 1.3296947479248047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30554869771003723, + "epoch": 6.38, + "learning_rate": 1.811073541842773e-05, + "loss": 0.4104, + "step": 7545, + "task_loss": 0.873447597026825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27128180861473083, + "epoch": 6.38, + "learning_rate": 1.8106508875739645e-05, + "loss": 0.3371, + "step": 7546, + "task_loss": 0.23995764553546906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30541348457336426, + "epoch": 6.38, + "learning_rate": 1.8102282333051564e-05, + "loss": 0.3588, + "step": 7547, + "task_loss": 1.0768522024154663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7742637991905212, + "epoch": 6.38, + "learning_rate": 1.8098055790363484e-05, + "loss": 0.5042, + "step": 7548, + "task_loss": 0.9513492584228516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6447891592979431, + "epoch": 6.38, + "learning_rate": 1.8093829247675404e-05, + "loss": 0.471, + "step": 7549, + "task_loss": 0.29645898938179016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23384442925453186, + "epoch": 6.38, + "learning_rate": 1.808960270498732e-05, + "loss": 0.3213, + "step": 7550, + "task_loss": 0.07928332686424255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46045786142349243, + "epoch": 6.38, + "learning_rate": 1.808537616229924e-05, + "loss": 0.4692, + "step": 7551, + "task_loss": 1.2746710777282715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47530269622802734, + "epoch": 6.38, + "learning_rate": 1.808114961961116e-05, + "loss": 0.3891, + "step": 7552, + "task_loss": 0.2463846504688263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3869553804397583, + "epoch": 6.38, + "learning_rate": 1.8076923076923076e-05, + "loss": 0.3293, + "step": 7553, + "task_loss": 1.1909068822860718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25528866052627563, + "epoch": 6.39, + "learning_rate": 1.8072696534235e-05, + "loss": 0.4568, + "step": 7554, + "task_loss": 1.2912665605545044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3567180633544922, + "epoch": 6.39, + "learning_rate": 1.8068469991546916e-05, + "loss": 0.4169, + "step": 7555, + "task_loss": 0.6024770140647888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4185871183872223, + "epoch": 6.39, + "learning_rate": 1.8064243448858832e-05, + "loss": 0.4757, + "step": 7556, + "task_loss": 0.3790021538734436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6990261673927307, + "epoch": 6.39, + "learning_rate": 1.8060016906170755e-05, + "loss": 0.6151, + "step": 7557, + "task_loss": 0.6352113485336304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41373658180236816, + "epoch": 6.39, + "learning_rate": 1.805579036348267e-05, + "loss": 0.4275, + "step": 7558, + "task_loss": 0.8403230905532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25877469778060913, + "epoch": 6.39, + "learning_rate": 1.805156382079459e-05, + "loss": 0.413, + "step": 7559, + "task_loss": 0.801657497882843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.670606255531311, + "epoch": 6.39, + "learning_rate": 1.804733727810651e-05, + "loss": 0.4919, + "step": 7560, + "task_loss": 0.5019938945770264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5482395887374878, + "epoch": 6.39, + "learning_rate": 1.8043110735418427e-05, + "loss": 0.4876, + "step": 7561, + "task_loss": 1.5432777404785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45872238278388977, + "epoch": 6.39, + "learning_rate": 1.8038884192730347e-05, + "loss": 0.3391, + "step": 7562, + "task_loss": 0.7343668937683105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21897876262664795, + "epoch": 6.39, + "learning_rate": 1.8034657650042267e-05, + "loss": 0.5076, + "step": 7563, + "task_loss": 0.18497462570667267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6142939925193787, + "epoch": 6.39, + "learning_rate": 1.8030431107354183e-05, + "loss": 0.5886, + "step": 7564, + "task_loss": 0.24114851653575897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5178631544113159, + "epoch": 6.39, + "learning_rate": 1.8026204564666106e-05, + "loss": 0.4778, + "step": 7565, + "task_loss": 0.7954056262969971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4254770874977112, + "epoch": 6.4, + "learning_rate": 1.8021978021978023e-05, + "loss": 0.5199, + "step": 7566, + "task_loss": 0.8415508270263672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41067150235176086, + "epoch": 6.4, + "learning_rate": 1.801775147928994e-05, + "loss": 0.375, + "step": 7567, + "task_loss": 0.77430659532547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5092525482177734, + "epoch": 6.4, + "learning_rate": 1.8013524936601862e-05, + "loss": 0.4736, + "step": 7568, + "task_loss": 0.7892408967018127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3329617381095886, + "epoch": 6.4, + "learning_rate": 1.800929839391378e-05, + "loss": 0.4791, + "step": 7569, + "task_loss": 0.0758054181933403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2235378623008728, + "epoch": 6.4, + "learning_rate": 1.8005071851225698e-05, + "loss": 0.4112, + "step": 7570, + "task_loss": 0.9776051044464111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42747461795806885, + "epoch": 6.4, + "learning_rate": 1.8000845308537618e-05, + "loss": 0.4326, + "step": 7571, + "task_loss": 0.1078256219625473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6441366076469421, + "epoch": 6.4, + "learning_rate": 1.7996618765849534e-05, + "loss": 0.475, + "step": 7572, + "task_loss": 1.3679238557815552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3199275732040405, + "epoch": 6.4, + "learning_rate": 1.7992392223161454e-05, + "loss": 0.5162, + "step": 7573, + "task_loss": 0.7930371165275574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4770206809043884, + "epoch": 6.4, + "learning_rate": 1.7988165680473374e-05, + "loss": 0.5258, + "step": 7574, + "task_loss": 1.112716555595398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5162943005561829, + "epoch": 6.4, + "learning_rate": 1.798393913778529e-05, + "loss": 0.4992, + "step": 7575, + "task_loss": 1.3846509456634521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.660395085811615, + "epoch": 6.4, + "learning_rate": 1.7979712595097213e-05, + "loss": 0.51, + "step": 7576, + "task_loss": 0.30320313572883606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3691035211086273, + "epoch": 6.4, + "learning_rate": 1.797548605240913e-05, + "loss": 0.3764, + "step": 7577, + "task_loss": 0.11380445212125778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2644013464450836, + "epoch": 6.41, + "learning_rate": 1.797125950972105e-05, + "loss": 0.4863, + "step": 7578, + "task_loss": 0.2990926504135132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35361629724502563, + "epoch": 6.41, + "learning_rate": 1.796703296703297e-05, + "loss": 0.4292, + "step": 7579, + "task_loss": 0.6158581972122192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5587254762649536, + "epoch": 6.41, + "learning_rate": 1.7962806424344885e-05, + "loss": 0.4575, + "step": 7580, + "task_loss": 0.30703744292259216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6294219493865967, + "epoch": 6.41, + "learning_rate": 1.7958579881656805e-05, + "loss": 0.4484, + "step": 7581, + "task_loss": 0.9403088688850403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.503575325012207, + "epoch": 6.41, + "learning_rate": 1.7954353338968725e-05, + "loss": 0.4307, + "step": 7582, + "task_loss": 0.2523866593837738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4844515025615692, + "epoch": 6.41, + "learning_rate": 1.7950126796280645e-05, + "loss": 0.4503, + "step": 7583, + "task_loss": 0.757355272769928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1727587729692459, + "epoch": 6.41, + "learning_rate": 1.794590025359256e-05, + "loss": 0.3645, + "step": 7584, + "task_loss": 0.3840113878250122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25857457518577576, + "epoch": 6.41, + "learning_rate": 1.794167371090448e-05, + "loss": 0.3318, + "step": 7585, + "task_loss": 0.6746025681495667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7401986122131348, + "epoch": 6.41, + "learning_rate": 1.79374471682164e-05, + "loss": 0.5505, + "step": 7586, + "task_loss": 1.4082059860229492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5529409050941467, + "epoch": 6.41, + "learning_rate": 1.793322062552832e-05, + "loss": 0.4898, + "step": 7587, + "task_loss": 1.2931978702545166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3298785984516144, + "epoch": 6.41, + "learning_rate": 1.7928994082840236e-05, + "loss": 0.3668, + "step": 7588, + "task_loss": 0.46625396609306335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39309418201446533, + "epoch": 6.41, + "learning_rate": 1.7924767540152156e-05, + "loss": 0.5826, + "step": 7589, + "task_loss": 1.2388874292373657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47886645793914795, + "epoch": 6.42, + "learning_rate": 1.7920540997464076e-05, + "loss": 0.3442, + "step": 7590, + "task_loss": 0.1339566707611084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4233910143375397, + "epoch": 6.42, + "learning_rate": 1.7916314454775996e-05, + "loss": 0.418, + "step": 7591, + "task_loss": 0.7103968262672424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23849478363990784, + "epoch": 6.42, + "learning_rate": 1.7912087912087912e-05, + "loss": 0.4414, + "step": 7592, + "task_loss": 0.18312296271324158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5746069550514221, + "epoch": 6.42, + "learning_rate": 1.7907861369399832e-05, + "loss": 0.4864, + "step": 7593, + "task_loss": 1.2621407508850098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49882417917251587, + "epoch": 6.42, + "learning_rate": 1.790363482671175e-05, + "loss": 0.4938, + "step": 7594, + "task_loss": 0.4164222776889801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5298264622688293, + "epoch": 6.42, + "learning_rate": 1.7899408284023668e-05, + "loss": 0.406, + "step": 7595, + "task_loss": 0.9813674688339233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40544670820236206, + "epoch": 6.42, + "learning_rate": 1.7895181741335588e-05, + "loss": 0.4754, + "step": 7596, + "task_loss": 0.13477036356925964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30339473485946655, + "epoch": 6.42, + "learning_rate": 1.7890955198647507e-05, + "loss": 0.2934, + "step": 7597, + "task_loss": 0.26246318221092224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5171187520027161, + "epoch": 6.42, + "learning_rate": 1.7886728655959427e-05, + "loss": 0.3553, + "step": 7598, + "task_loss": 0.2201736569404602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3105328381061554, + "epoch": 6.42, + "learning_rate": 1.7882502113271347e-05, + "loss": 0.4203, + "step": 7599, + "task_loss": 1.019685983657837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7692074775695801, + "epoch": 6.42, + "learning_rate": 1.7878275570583263e-05, + "loss": 0.4443, + "step": 7600, + "task_loss": 0.588320255279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44372984766960144, + "epoch": 6.42, + "learning_rate": 1.7874049027895183e-05, + "loss": 0.2985, + "step": 7601, + "task_loss": 1.1915132999420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30329430103302, + "epoch": 6.43, + "learning_rate": 1.7869822485207103e-05, + "loss": 0.264, + "step": 7602, + "task_loss": 0.18520188331604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.576276421546936, + "epoch": 6.43, + "learning_rate": 1.786559594251902e-05, + "loss": 0.5265, + "step": 7603, + "task_loss": 0.7379361391067505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26150569319725037, + "epoch": 6.43, + "learning_rate": 1.786136939983094e-05, + "loss": 0.3709, + "step": 7604, + "task_loss": 0.6333028078079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2518298029899597, + "epoch": 6.43, + "learning_rate": 1.785714285714286e-05, + "loss": 0.3512, + "step": 7605, + "task_loss": 0.11739340424537659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24102556705474854, + "epoch": 6.43, + "learning_rate": 1.7852916314454775e-05, + "loss": 0.3179, + "step": 7606, + "task_loss": 0.677313506603241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.376369833946228, + "epoch": 6.43, + "learning_rate": 1.7848689771766698e-05, + "loss": 0.3656, + "step": 7607, + "task_loss": 0.41247788071632385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3990139365196228, + "epoch": 6.43, + "learning_rate": 1.7844463229078614e-05, + "loss": 0.5136, + "step": 7608, + "task_loss": 1.203552007675171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35530394315719604, + "epoch": 6.43, + "learning_rate": 1.7840236686390534e-05, + "loss": 0.4456, + "step": 7609, + "task_loss": 1.049601435661316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44617512822151184, + "epoch": 6.43, + "learning_rate": 1.7836010143702454e-05, + "loss": 0.3467, + "step": 7610, + "task_loss": 0.31806641817092896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49190235137939453, + "epoch": 6.43, + "learning_rate": 1.783178360101437e-05, + "loss": 0.3908, + "step": 7611, + "task_loss": 0.7891565561294556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38216811418533325, + "epoch": 6.43, + "learning_rate": 1.782755705832629e-05, + "loss": 0.3783, + "step": 7612, + "task_loss": 0.45590490102767944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43023359775543213, + "epoch": 6.44, + "learning_rate": 1.782333051563821e-05, + "loss": 0.451, + "step": 7613, + "task_loss": 0.3245699107646942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5369499921798706, + "epoch": 6.44, + "learning_rate": 1.7819103972950126e-05, + "loss": 0.5567, + "step": 7614, + "task_loss": 1.775534749031067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6040256023406982, + "epoch": 6.44, + "learning_rate": 1.781487743026205e-05, + "loss": 0.5561, + "step": 7615, + "task_loss": 0.619125247001648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33821651339530945, + "epoch": 6.44, + "learning_rate": 1.7810650887573965e-05, + "loss": 0.4678, + "step": 7616, + "task_loss": 0.9404986500740051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5068166255950928, + "epoch": 6.44, + "learning_rate": 1.7806424344885882e-05, + "loss": 0.4455, + "step": 7617, + "task_loss": 0.20557530224323273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.646491527557373, + "epoch": 6.44, + "learning_rate": 1.7802197802197805e-05, + "loss": 0.4895, + "step": 7618, + "task_loss": 0.8594582080841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39502227306365967, + "epoch": 6.44, + "learning_rate": 1.779797125950972e-05, + "loss": 0.4957, + "step": 7619, + "task_loss": 1.2211828231811523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35524773597717285, + "epoch": 6.44, + "learning_rate": 1.779374471682164e-05, + "loss": 0.423, + "step": 7620, + "task_loss": 0.5577415823936462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3755262494087219, + "epoch": 6.44, + "learning_rate": 1.778951817413356e-05, + "loss": 0.4732, + "step": 7621, + "task_loss": 0.07496852427721024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4170519709587097, + "epoch": 6.44, + "learning_rate": 1.7785291631445477e-05, + "loss": 0.4497, + "step": 7622, + "task_loss": 0.8204951286315918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48731687664985657, + "epoch": 6.44, + "learning_rate": 1.7781065088757397e-05, + "loss": 0.4103, + "step": 7623, + "task_loss": 0.37467193603515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6175440549850464, + "epoch": 6.44, + "learning_rate": 1.7776838546069317e-05, + "loss": 0.3962, + "step": 7624, + "task_loss": 0.7171947956085205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44085437059402466, + "epoch": 6.45, + "learning_rate": 1.7772612003381233e-05, + "loss": 0.4773, + "step": 7625, + "task_loss": 0.34319713711738586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2469516098499298, + "epoch": 6.45, + "learning_rate": 1.7768385460693153e-05, + "loss": 0.3655, + "step": 7626, + "task_loss": 0.17120546102523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.558559775352478, + "epoch": 6.45, + "learning_rate": 1.7764158918005072e-05, + "loss": 0.5575, + "step": 7627, + "task_loss": 0.863972544670105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4851981997489929, + "epoch": 6.45, + "learning_rate": 1.7759932375316992e-05, + "loss": 0.5302, + "step": 7628, + "task_loss": 0.6844739317893982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42256858944892883, + "epoch": 6.45, + "learning_rate": 1.7755705832628912e-05, + "loss": 0.3199, + "step": 7629, + "task_loss": 0.577381432056427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7095748782157898, + "epoch": 6.45, + "learning_rate": 1.7751479289940828e-05, + "loss": 0.4768, + "step": 7630, + "task_loss": 0.9156450629234314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6058624982833862, + "epoch": 6.45, + "learning_rate": 1.7747252747252748e-05, + "loss": 0.6172, + "step": 7631, + "task_loss": 0.04020536318421364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5482578277587891, + "epoch": 6.45, + "learning_rate": 1.7743026204564668e-05, + "loss": 0.5922, + "step": 7632, + "task_loss": 1.8229844570159912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6664584875106812, + "epoch": 6.45, + "learning_rate": 1.7738799661876584e-05, + "loss": 0.5746, + "step": 7633, + "task_loss": 0.5663739442825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3898431360721588, + "epoch": 6.45, + "learning_rate": 1.7734573119188504e-05, + "loss": 0.383, + "step": 7634, + "task_loss": 0.9622520804405212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36202654242515564, + "epoch": 6.45, + "learning_rate": 1.7730346576500424e-05, + "loss": 0.415, + "step": 7635, + "task_loss": 0.5289613604545593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34619855880737305, + "epoch": 6.45, + "learning_rate": 1.7726120033812343e-05, + "loss": 0.3568, + "step": 7636, + "task_loss": 0.15002316236495972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3086897134780884, + "epoch": 6.46, + "learning_rate": 1.772189349112426e-05, + "loss": 0.5307, + "step": 7637, + "task_loss": 0.7400524020195007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39203473925590515, + "epoch": 6.46, + "learning_rate": 1.771766694843618e-05, + "loss": 0.4005, + "step": 7638, + "task_loss": 1.561889886856079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5477657914161682, + "epoch": 6.46, + "learning_rate": 1.77134404057481e-05, + "loss": 0.7463, + "step": 7639, + "task_loss": 0.6657342314720154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5110782384872437, + "epoch": 6.46, + "learning_rate": 1.770921386306002e-05, + "loss": 0.4163, + "step": 7640, + "task_loss": 0.455340176820755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31109434366226196, + "epoch": 6.46, + "learning_rate": 1.770498732037194e-05, + "loss": 0.4609, + "step": 7641, + "task_loss": 0.5676177144050598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48154306411743164, + "epoch": 6.46, + "learning_rate": 1.7700760777683855e-05, + "loss": 0.4617, + "step": 7642, + "task_loss": 0.38741618394851685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5222729444503784, + "epoch": 6.46, + "learning_rate": 1.7696534234995775e-05, + "loss": 0.4583, + "step": 7643, + "task_loss": 1.2537728548049927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22973501682281494, + "epoch": 6.46, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.3225, + "step": 7644, + "task_loss": 0.2684953212738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28304338455200195, + "epoch": 6.46, + "learning_rate": 1.768808114961961e-05, + "loss": 0.3579, + "step": 7645, + "task_loss": 0.5095087289810181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5840449333190918, + "epoch": 6.46, + "learning_rate": 1.768385460693153e-05, + "loss": 0.4825, + "step": 7646, + "task_loss": 0.3429000675678253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26519423723220825, + "epoch": 6.46, + "learning_rate": 1.767962806424345e-05, + "loss": 0.2672, + "step": 7647, + "task_loss": 0.21497784554958344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37983691692352295, + "epoch": 6.46, + "learning_rate": 1.7675401521555367e-05, + "loss": 0.3075, + "step": 7648, + "task_loss": 1.3823068141937256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3351721167564392, + "epoch": 6.47, + "learning_rate": 1.767117497886729e-05, + "loss": 0.3132, + "step": 7649, + "task_loss": 0.2256188541650772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7568562030792236, + "epoch": 6.47, + "learning_rate": 1.7666948436179206e-05, + "loss": 0.5098, + "step": 7650, + "task_loss": 0.42638882994651794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24094784259796143, + "epoch": 6.47, + "learning_rate": 1.7662721893491126e-05, + "loss": 0.3808, + "step": 7651, + "task_loss": 0.643253743648529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2730315923690796, + "epoch": 6.47, + "learning_rate": 1.7658495350803046e-05, + "loss": 0.4613, + "step": 7652, + "task_loss": 0.6675654053688049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4155542254447937, + "epoch": 6.47, + "learning_rate": 1.7654268808114962e-05, + "loss": 0.5118, + "step": 7653, + "task_loss": 0.30436116456985474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3908953070640564, + "epoch": 6.47, + "learning_rate": 1.765004226542688e-05, + "loss": 0.4511, + "step": 7654, + "task_loss": 0.170237734913826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24735316634178162, + "epoch": 6.47, + "learning_rate": 1.76458157227388e-05, + "loss": 0.273, + "step": 7655, + "task_loss": 0.17232540249824524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40924322605133057, + "epoch": 6.47, + "learning_rate": 1.7641589180050718e-05, + "loss": 0.3648, + "step": 7656, + "task_loss": 0.42511653900146484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4168434739112854, + "epoch": 6.47, + "learning_rate": 1.763736263736264e-05, + "loss": 0.379, + "step": 7657, + "task_loss": 0.09970150142908096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6869746446609497, + "epoch": 6.47, + "learning_rate": 1.7633136094674557e-05, + "loss": 0.4418, + "step": 7658, + "task_loss": 0.8750053644180298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3677084445953369, + "epoch": 6.47, + "learning_rate": 1.7628909551986474e-05, + "loss": 0.3885, + "step": 7659, + "task_loss": 0.2624224126338959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36935144662857056, + "epoch": 6.47, + "learning_rate": 1.7624683009298397e-05, + "loss": 0.3242, + "step": 7660, + "task_loss": 0.11176121979951859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17497488856315613, + "epoch": 6.48, + "learning_rate": 1.7620456466610313e-05, + "loss": 0.3569, + "step": 7661, + "task_loss": 0.4668859541416168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25344911217689514, + "epoch": 6.48, + "learning_rate": 1.7616229923922233e-05, + "loss": 0.4445, + "step": 7662, + "task_loss": 0.7531964182853699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29672592878341675, + "epoch": 6.48, + "learning_rate": 1.7612003381234152e-05, + "loss": 0.468, + "step": 7663, + "task_loss": 0.15726575255393982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5118647217750549, + "epoch": 6.48, + "learning_rate": 1.760777683854607e-05, + "loss": 0.5952, + "step": 7664, + "task_loss": 0.8347459435462952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46826356649398804, + "epoch": 6.48, + "learning_rate": 1.760355029585799e-05, + "loss": 0.4859, + "step": 7665, + "task_loss": 1.637624979019165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5311981439590454, + "epoch": 6.48, + "learning_rate": 1.759932375316991e-05, + "loss": 0.5427, + "step": 7666, + "task_loss": 0.8321746587753296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3801642954349518, + "epoch": 6.48, + "learning_rate": 1.7595097210481825e-05, + "loss": 0.3187, + "step": 7667, + "task_loss": 0.5472202897071838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48967641592025757, + "epoch": 6.48, + "learning_rate": 1.7590870667793748e-05, + "loss": 0.3944, + "step": 7668, + "task_loss": 0.5914241075515747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3747219145298004, + "epoch": 6.48, + "learning_rate": 1.7586644125105664e-05, + "loss": 0.3464, + "step": 7669, + "task_loss": 0.6315202116966248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3646041452884674, + "epoch": 6.48, + "learning_rate": 1.7582417582417584e-05, + "loss": 0.3837, + "step": 7670, + "task_loss": 1.2037979364395142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30353668332099915, + "epoch": 6.48, + "learning_rate": 1.7578191039729504e-05, + "loss": 0.4312, + "step": 7671, + "task_loss": 0.7560117840766907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5347833633422852, + "epoch": 6.48, + "learning_rate": 1.757396449704142e-05, + "loss": 0.4328, + "step": 7672, + "task_loss": 0.527636706829071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33705028891563416, + "epoch": 6.49, + "learning_rate": 1.756973795435334e-05, + "loss": 0.3887, + "step": 7673, + "task_loss": 0.24453981220722198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.374274343252182, + "epoch": 6.49, + "learning_rate": 1.756551141166526e-05, + "loss": 0.4261, + "step": 7674, + "task_loss": 0.3021724820137024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4270437955856323, + "epoch": 6.49, + "learning_rate": 1.7561284868977176e-05, + "loss": 0.3873, + "step": 7675, + "task_loss": 0.13286586105823517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44173240661621094, + "epoch": 6.49, + "learning_rate": 1.7557058326289096e-05, + "loss": 0.3775, + "step": 7676, + "task_loss": 0.5347998738288879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2840273380279541, + "epoch": 6.49, + "learning_rate": 1.7552831783601015e-05, + "loss": 0.373, + "step": 7677, + "task_loss": 0.10822489857673645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4916588068008423, + "epoch": 6.49, + "learning_rate": 1.7548605240912935e-05, + "loss": 0.4731, + "step": 7678, + "task_loss": 0.4730340838432312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39222168922424316, + "epoch": 6.49, + "learning_rate": 1.7544378698224855e-05, + "loss": 0.5038, + "step": 7679, + "task_loss": 0.4105307161808014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3812994360923767, + "epoch": 6.49, + "learning_rate": 1.754015215553677e-05, + "loss": 0.6001, + "step": 7680, + "task_loss": 1.0733082294464111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23272617161273956, + "epoch": 6.49, + "learning_rate": 1.753592561284869e-05, + "loss": 0.3328, + "step": 7681, + "task_loss": 0.16583546996116638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34671711921691895, + "epoch": 6.49, + "learning_rate": 1.753169907016061e-05, + "loss": 0.3929, + "step": 7682, + "task_loss": 0.0851937010884285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30802252888679504, + "epoch": 6.49, + "learning_rate": 1.7527472527472527e-05, + "loss": 0.4577, + "step": 7683, + "task_loss": 0.028649890795350075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27338820695877075, + "epoch": 6.5, + "learning_rate": 1.7523245984784447e-05, + "loss": 0.5321, + "step": 7684, + "task_loss": 0.3567676842212677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33301764726638794, + "epoch": 6.5, + "learning_rate": 1.7519019442096366e-05, + "loss": 0.366, + "step": 7685, + "task_loss": 0.8334814310073853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2736169993877411, + "epoch": 6.5, + "learning_rate": 1.7514792899408286e-05, + "loss": 0.329, + "step": 7686, + "task_loss": 0.6179097294807434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4419115483760834, + "epoch": 6.5, + "learning_rate": 1.7510566356720203e-05, + "loss": 0.4189, + "step": 7687, + "task_loss": 1.06377112865448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38894888758659363, + "epoch": 6.5, + "learning_rate": 1.7506339814032122e-05, + "loss": 0.4029, + "step": 7688, + "task_loss": 0.5451747179031372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38339364528656006, + "epoch": 6.5, + "learning_rate": 1.7502113271344042e-05, + "loss": 0.4093, + "step": 7689, + "task_loss": 0.7164480090141296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9603133201599121, + "epoch": 6.5, + "learning_rate": 1.749788672865596e-05, + "loss": 0.5462, + "step": 7690, + "task_loss": 1.6306289434432983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6875299215316772, + "epoch": 6.5, + "learning_rate": 1.7493660185967878e-05, + "loss": 0.5243, + "step": 7691, + "task_loss": 0.9978961944580078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2090688794851303, + "epoch": 6.5, + "learning_rate": 1.7489433643279798e-05, + "loss": 0.3599, + "step": 7692, + "task_loss": 0.08115692436695099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.434048593044281, + "epoch": 6.5, + "learning_rate": 1.7485207100591718e-05, + "loss": 0.3888, + "step": 7693, + "task_loss": 0.5820990800857544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2986835241317749, + "epoch": 6.5, + "learning_rate": 1.7480980557903637e-05, + "loss": 0.2882, + "step": 7694, + "task_loss": 0.27528345584869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31939250230789185, + "epoch": 6.5, + "learning_rate": 1.7476754015215554e-05, + "loss": 0.4142, + "step": 7695, + "task_loss": 0.2997313141822815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.52631014585495, + "epoch": 6.51, + "learning_rate": 1.7472527472527473e-05, + "loss": 0.4716, + "step": 7696, + "task_loss": 1.1308040618896484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3174107074737549, + "epoch": 6.51, + "learning_rate": 1.7468300929839393e-05, + "loss": 0.3862, + "step": 7697, + "task_loss": 0.9476327896118164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.548916220664978, + "epoch": 6.51, + "learning_rate": 1.746407438715131e-05, + "loss": 0.3609, + "step": 7698, + "task_loss": 0.5129746198654175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3017251789569855, + "epoch": 6.51, + "learning_rate": 1.7459847844463233e-05, + "loss": 0.5279, + "step": 7699, + "task_loss": 0.21507754921913147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5731245279312134, + "epoch": 6.51, + "learning_rate": 1.745562130177515e-05, + "loss": 0.5437, + "step": 7700, + "task_loss": 0.8468421101570129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.548230767250061, + "epoch": 6.51, + "learning_rate": 1.7451394759087065e-05, + "loss": 0.52, + "step": 7701, + "task_loss": 0.34524816274642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2860320508480072, + "epoch": 6.51, + "learning_rate": 1.744716821639899e-05, + "loss": 0.4362, + "step": 7702, + "task_loss": 0.32338404655456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35994386672973633, + "epoch": 6.51, + "learning_rate": 1.7442941673710905e-05, + "loss": 0.3644, + "step": 7703, + "task_loss": 0.6820021271705627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2620415687561035, + "epoch": 6.51, + "learning_rate": 1.7438715131022825e-05, + "loss": 0.4389, + "step": 7704, + "task_loss": 0.46701785922050476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46705323457717896, + "epoch": 6.51, + "learning_rate": 1.7434488588334744e-05, + "loss": 0.4018, + "step": 7705, + "task_loss": 0.3430244028568268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4521525502204895, + "epoch": 6.51, + "learning_rate": 1.743026204564666e-05, + "loss": 0.3825, + "step": 7706, + "task_loss": 0.44167059659957886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33179032802581787, + "epoch": 6.51, + "learning_rate": 1.742603550295858e-05, + "loss": 0.4463, + "step": 7707, + "task_loss": 0.832005500793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45206132531166077, + "epoch": 6.52, + "learning_rate": 1.74218089602705e-05, + "loss": 0.3853, + "step": 7708, + "task_loss": 1.3893319368362427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43434131145477295, + "epoch": 6.52, + "learning_rate": 1.7417582417582416e-05, + "loss": 0.4519, + "step": 7709, + "task_loss": 0.6699163317680359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5219091773033142, + "epoch": 6.52, + "learning_rate": 1.741335587489434e-05, + "loss": 0.4904, + "step": 7710, + "task_loss": 0.7903644442558289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5327441096305847, + "epoch": 6.52, + "learning_rate": 1.7409129332206256e-05, + "loss": 0.6486, + "step": 7711, + "task_loss": 0.20667216181755066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.343507319688797, + "epoch": 6.52, + "learning_rate": 1.7404902789518172e-05, + "loss": 0.389, + "step": 7712, + "task_loss": 0.9039720296859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3960939943790436, + "epoch": 6.52, + "learning_rate": 1.7400676246830095e-05, + "loss": 0.47, + "step": 7713, + "task_loss": 0.27050456404685974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3591465353965759, + "epoch": 6.52, + "learning_rate": 1.7396449704142012e-05, + "loss": 0.4253, + "step": 7714, + "task_loss": 0.5246472954750061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.284004271030426, + "epoch": 6.52, + "learning_rate": 1.739222316145393e-05, + "loss": 0.4174, + "step": 7715, + "task_loss": 0.8015879392623901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2273007035255432, + "epoch": 6.52, + "learning_rate": 1.738799661876585e-05, + "loss": 0.3425, + "step": 7716, + "task_loss": 0.40578991174697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3479277789592743, + "epoch": 6.52, + "learning_rate": 1.7383770076077768e-05, + "loss": 0.4002, + "step": 7717, + "task_loss": 0.6750699877738953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4607001841068268, + "epoch": 6.52, + "learning_rate": 1.7379543533389687e-05, + "loss": 0.5117, + "step": 7718, + "task_loss": 0.6770851612091064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.315701961517334, + "epoch": 6.52, + "learning_rate": 1.7375316990701607e-05, + "loss": 0.4307, + "step": 7719, + "task_loss": 0.7415759563446045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4673118591308594, + "epoch": 6.53, + "learning_rate": 1.7371090448013523e-05, + "loss": 0.4934, + "step": 7720, + "task_loss": 0.9815607666969299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.584915041923523, + "epoch": 6.53, + "learning_rate": 1.7366863905325447e-05, + "loss": 0.3955, + "step": 7721, + "task_loss": 0.5981960296630859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4899916648864746, + "epoch": 6.53, + "learning_rate": 1.7362637362637363e-05, + "loss": 0.5718, + "step": 7722, + "task_loss": 0.8930051922798157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.413751482963562, + "epoch": 6.53, + "learning_rate": 1.7358410819949283e-05, + "loss": 0.491, + "step": 7723, + "task_loss": 0.29914146661758423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4485185146331787, + "epoch": 6.53, + "learning_rate": 1.7354184277261202e-05, + "loss": 0.5704, + "step": 7724, + "task_loss": 0.36231741309165955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4056541621685028, + "epoch": 6.53, + "learning_rate": 1.734995773457312e-05, + "loss": 0.406, + "step": 7725, + "task_loss": 1.0025750398635864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3242490887641907, + "epoch": 6.53, + "learning_rate": 1.734573119188504e-05, + "loss": 0.445, + "step": 7726, + "task_loss": 0.7842551469802856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44807493686676025, + "epoch": 6.53, + "learning_rate": 1.7341504649196958e-05, + "loss": 0.3608, + "step": 7727, + "task_loss": 0.23884382843971252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21364375948905945, + "epoch": 6.53, + "learning_rate": 1.7337278106508875e-05, + "loss": 0.3179, + "step": 7728, + "task_loss": 0.02064407989382744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3939073085784912, + "epoch": 6.53, + "learning_rate": 1.7333051563820794e-05, + "loss": 0.363, + "step": 7729, + "task_loss": 0.5518420338630676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33679160475730896, + "epoch": 6.53, + "learning_rate": 1.7328825021132714e-05, + "loss": 0.3854, + "step": 7730, + "task_loss": 0.480367511510849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44632411003112793, + "epoch": 6.53, + "learning_rate": 1.7324598478444634e-05, + "loss": 0.4849, + "step": 7731, + "task_loss": 0.3868582546710968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3780495524406433, + "epoch": 6.54, + "learning_rate": 1.7320371935756553e-05, + "loss": 0.3508, + "step": 7732, + "task_loss": 0.9864240288734436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34663525223731995, + "epoch": 6.54, + "learning_rate": 1.731614539306847e-05, + "loss": 0.3621, + "step": 7733, + "task_loss": 0.609247088432312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4681684374809265, + "epoch": 6.54, + "learning_rate": 1.731191885038039e-05, + "loss": 0.4869, + "step": 7734, + "task_loss": 0.5776069164276123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35782939195632935, + "epoch": 6.54, + "learning_rate": 1.730769230769231e-05, + "loss": 0.348, + "step": 7735, + "task_loss": 0.37651339173316956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3047736585140228, + "epoch": 6.54, + "learning_rate": 1.730346576500423e-05, + "loss": 0.3089, + "step": 7736, + "task_loss": 1.1231772899627686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6444770097732544, + "epoch": 6.54, + "learning_rate": 1.7299239222316145e-05, + "loss": 0.4698, + "step": 7737, + "task_loss": 1.2928693294525146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44074711203575134, + "epoch": 6.54, + "learning_rate": 1.7295012679628065e-05, + "loss": 0.368, + "step": 7738, + "task_loss": 0.5922412276268005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4281979203224182, + "epoch": 6.54, + "learning_rate": 1.7290786136939985e-05, + "loss": 0.3498, + "step": 7739, + "task_loss": 0.41255372762680054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21640227735042572, + "epoch": 6.54, + "learning_rate": 1.72865595942519e-05, + "loss": 0.3073, + "step": 7740, + "task_loss": 0.42043188214302063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38790735602378845, + "epoch": 6.54, + "learning_rate": 1.728233305156382e-05, + "loss": 0.4182, + "step": 7741, + "task_loss": 0.4633443057537079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24173420667648315, + "epoch": 6.54, + "learning_rate": 1.727810650887574e-05, + "loss": 0.2969, + "step": 7742, + "task_loss": 0.166888028383255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5460012555122375, + "epoch": 6.54, + "learning_rate": 1.727387996618766e-05, + "loss": 0.3732, + "step": 7743, + "task_loss": 0.6136801838874817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5152308940887451, + "epoch": 6.55, + "learning_rate": 1.726965342349958e-05, + "loss": 0.4624, + "step": 7744, + "task_loss": 0.5298154950141907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3607529401779175, + "epoch": 6.55, + "learning_rate": 1.7265426880811497e-05, + "loss": 0.4467, + "step": 7745, + "task_loss": 0.619924783706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4113730788230896, + "epoch": 6.55, + "learning_rate": 1.7261200338123416e-05, + "loss": 0.5192, + "step": 7746, + "task_loss": 0.19888818264007568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8641488552093506, + "epoch": 6.55, + "learning_rate": 1.7256973795435336e-05, + "loss": 0.6035, + "step": 7747, + "task_loss": 0.9087063074111938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38239723443984985, + "epoch": 6.55, + "learning_rate": 1.7252747252747252e-05, + "loss": 0.3608, + "step": 7748, + "task_loss": 0.07301051169633865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6883953809738159, + "epoch": 6.55, + "learning_rate": 1.7248520710059172e-05, + "loss": 0.633, + "step": 7749, + "task_loss": 1.3126689195632935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33494848012924194, + "epoch": 6.55, + "learning_rate": 1.7244294167371092e-05, + "loss": 0.4549, + "step": 7750, + "task_loss": 0.700218141078949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3619251847267151, + "epoch": 6.55, + "learning_rate": 1.7240067624683008e-05, + "loss": 0.4097, + "step": 7751, + "task_loss": 0.2858006954193115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37588953971862793, + "epoch": 6.55, + "learning_rate": 1.723584108199493e-05, + "loss": 0.4991, + "step": 7752, + "task_loss": 0.7141557931900024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7976861000061035, + "epoch": 6.55, + "learning_rate": 1.7231614539306848e-05, + "loss": 0.4785, + "step": 7753, + "task_loss": 0.3111616373062134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4063098728656769, + "epoch": 6.55, + "learning_rate": 1.7227387996618764e-05, + "loss": 0.5148, + "step": 7754, + "task_loss": 1.5374327898025513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.004528522491455, + "epoch": 6.56, + "learning_rate": 1.7223161453930687e-05, + "loss": 0.574, + "step": 7755, + "task_loss": 1.3567016124725342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2833079993724823, + "epoch": 6.56, + "learning_rate": 1.7218934911242603e-05, + "loss": 0.3788, + "step": 7756, + "task_loss": 0.3709709346294403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.317840576171875, + "epoch": 6.56, + "learning_rate": 1.7214708368554523e-05, + "loss": 0.389, + "step": 7757, + "task_loss": 0.6490671038627625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4796914756298065, + "epoch": 6.56, + "learning_rate": 1.7210481825866443e-05, + "loss": 0.471, + "step": 7758, + "task_loss": 0.71211838722229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41714051365852356, + "epoch": 6.56, + "learning_rate": 1.720625528317836e-05, + "loss": 0.3597, + "step": 7759, + "task_loss": 0.5786429047584534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3412782847881317, + "epoch": 6.56, + "learning_rate": 1.7202028740490282e-05, + "loss": 0.5038, + "step": 7760, + "task_loss": 0.248692125082016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6352635622024536, + "epoch": 6.56, + "learning_rate": 1.71978021978022e-05, + "loss": 0.5103, + "step": 7761, + "task_loss": 0.8469898700714111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29890939593315125, + "epoch": 6.56, + "learning_rate": 1.7193575655114115e-05, + "loss": 0.3646, + "step": 7762, + "task_loss": 0.5046116709709167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5203819274902344, + "epoch": 6.56, + "learning_rate": 1.7189349112426038e-05, + "loss": 0.4202, + "step": 7763, + "task_loss": 0.4697435200214386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39496156573295593, + "epoch": 6.56, + "learning_rate": 1.7185122569737955e-05, + "loss": 0.3696, + "step": 7764, + "task_loss": 0.6271415948867798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37355777621269226, + "epoch": 6.56, + "learning_rate": 1.7180896027049874e-05, + "loss": 0.4259, + "step": 7765, + "task_loss": 0.9390147924423218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42374247312545776, + "epoch": 6.56, + "learning_rate": 1.7176669484361794e-05, + "loss": 0.4187, + "step": 7766, + "task_loss": 0.29991015791893005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21618777513504028, + "epoch": 6.57, + "learning_rate": 1.717244294167371e-05, + "loss": 0.3345, + "step": 7767, + "task_loss": 0.30570247769355774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.189159095287323, + "epoch": 6.57, + "learning_rate": 1.716821639898563e-05, + "loss": 0.4148, + "step": 7768, + "task_loss": 0.5018526315689087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1102922335267067, + "epoch": 6.57, + "learning_rate": 1.716398985629755e-05, + "loss": 0.2867, + "step": 7769, + "task_loss": 0.0073256222531199455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49159306287765503, + "epoch": 6.57, + "learning_rate": 1.7159763313609466e-05, + "loss": 0.463, + "step": 7770, + "task_loss": 0.31217604875564575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6200627088546753, + "epoch": 6.57, + "learning_rate": 1.7155536770921386e-05, + "loss": 0.5146, + "step": 7771, + "task_loss": 0.7909308671951294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3019719421863556, + "epoch": 6.57, + "learning_rate": 1.7151310228233306e-05, + "loss": 0.3225, + "step": 7772, + "task_loss": 0.3400425612926483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6875807046890259, + "epoch": 6.57, + "learning_rate": 1.7147083685545225e-05, + "loss": 0.494, + "step": 7773, + "task_loss": 1.2067588567733765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.461720734834671, + "epoch": 6.57, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.6045, + "step": 7774, + "task_loss": 0.6054428219795227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5062723755836487, + "epoch": 6.57, + "learning_rate": 1.713863060016906e-05, + "loss": 0.4223, + "step": 7775, + "task_loss": 0.8263669013977051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31832355260849, + "epoch": 6.57, + "learning_rate": 1.713440405748098e-05, + "loss": 0.3156, + "step": 7776, + "task_loss": 1.0313677787780762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4299007058143616, + "epoch": 6.57, + "learning_rate": 1.71301775147929e-05, + "loss": 0.4353, + "step": 7777, + "task_loss": 0.41176527738571167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49230480194091797, + "epoch": 6.57, + "learning_rate": 1.7125950972104817e-05, + "loss": 0.4562, + "step": 7778, + "task_loss": 1.2623660564422607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3376258611679077, + "epoch": 6.58, + "learning_rate": 1.7121724429416737e-05, + "loss": 0.4789, + "step": 7779, + "task_loss": 0.7366155982017517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3781077265739441, + "epoch": 6.58, + "learning_rate": 1.7117497886728657e-05, + "loss": 0.4186, + "step": 7780, + "task_loss": 0.7562388181686401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3058122396469116, + "epoch": 6.58, + "learning_rate": 1.7113271344040577e-05, + "loss": 0.435, + "step": 7781, + "task_loss": 0.7692509889602661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26906147599220276, + "epoch": 6.58, + "learning_rate": 1.7109044801352493e-05, + "loss": 0.3771, + "step": 7782, + "task_loss": 0.13587145507335663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5102202892303467, + "epoch": 6.58, + "learning_rate": 1.7104818258664413e-05, + "loss": 0.4139, + "step": 7783, + "task_loss": 1.2248032093048096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49844905734062195, + "epoch": 6.58, + "learning_rate": 1.7100591715976332e-05, + "loss": 0.4452, + "step": 7784, + "task_loss": 0.8754245042800903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2663879990577698, + "epoch": 6.58, + "learning_rate": 1.7096365173288252e-05, + "loss": 0.3024, + "step": 7785, + "task_loss": 0.16097551584243774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3796393871307373, + "epoch": 6.58, + "learning_rate": 1.709213863060017e-05, + "loss": 0.6043, + "step": 7786, + "task_loss": 1.1156126260757446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7183825969696045, + "epoch": 6.58, + "learning_rate": 1.7087912087912088e-05, + "loss": 0.408, + "step": 7787, + "task_loss": 1.122050166130066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4500882625579834, + "epoch": 6.58, + "learning_rate": 1.7083685545224008e-05, + "loss": 0.3446, + "step": 7788, + "task_loss": 0.4593886137008667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42194950580596924, + "epoch": 6.58, + "learning_rate": 1.7079459002535928e-05, + "loss": 0.3573, + "step": 7789, + "task_loss": 0.42660364508628845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27095115184783936, + "epoch": 6.58, + "learning_rate": 1.7075232459847844e-05, + "loss": 0.3464, + "step": 7790, + "task_loss": 0.20211376249790192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34839215874671936, + "epoch": 6.59, + "learning_rate": 1.7071005917159764e-05, + "loss": 0.433, + "step": 7791, + "task_loss": 0.5181761980056763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6757053732872009, + "epoch": 6.59, + "learning_rate": 1.7066779374471684e-05, + "loss": 0.4329, + "step": 7792, + "task_loss": 1.353811502456665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.309563547372818, + "epoch": 6.59, + "learning_rate": 1.70625528317836e-05, + "loss": 0.4309, + "step": 7793, + "task_loss": 0.885120689868927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5359622836112976, + "epoch": 6.59, + "learning_rate": 1.7058326289095523e-05, + "loss": 0.5483, + "step": 7794, + "task_loss": 0.6671273708343506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3897382318973541, + "epoch": 6.59, + "learning_rate": 1.705409974640744e-05, + "loss": 0.4413, + "step": 7795, + "task_loss": 0.3334895074367523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32747477293014526, + "epoch": 6.59, + "learning_rate": 1.704987320371936e-05, + "loss": 0.3572, + "step": 7796, + "task_loss": 0.738261342048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3865398168563843, + "epoch": 6.59, + "learning_rate": 1.704564666103128e-05, + "loss": 0.3786, + "step": 7797, + "task_loss": 0.28624165058135986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2486703097820282, + "epoch": 6.59, + "learning_rate": 1.7041420118343195e-05, + "loss": 0.2885, + "step": 7798, + "task_loss": 0.4943665564060211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3966832160949707, + "epoch": 6.59, + "learning_rate": 1.7037193575655115e-05, + "loss": 0.4402, + "step": 7799, + "task_loss": 0.4427167773246765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26261886954307556, + "epoch": 6.59, + "learning_rate": 1.7032967032967035e-05, + "loss": 0.4306, + "step": 7800, + "task_loss": 0.13064832985401154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32483941316604614, + "epoch": 6.59, + "learning_rate": 1.702874049027895e-05, + "loss": 0.4874, + "step": 7801, + "task_loss": 0.1987677365541458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2856143116950989, + "epoch": 6.59, + "learning_rate": 1.7024513947590874e-05, + "loss": 0.3863, + "step": 7802, + "task_loss": 0.5596334338188171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5095064640045166, + "epoch": 6.6, + "learning_rate": 1.702028740490279e-05, + "loss": 0.4872, + "step": 7803, + "task_loss": 0.8551172614097595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4662837088108063, + "epoch": 6.6, + "learning_rate": 1.7016060862214707e-05, + "loss": 0.4829, + "step": 7804, + "task_loss": 0.641343891620636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3950001895427704, + "epoch": 6.6, + "learning_rate": 1.701183431952663e-05, + "loss": 0.4731, + "step": 7805, + "task_loss": 0.7662146091461182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5467774271965027, + "epoch": 6.6, + "learning_rate": 1.7007607776838546e-05, + "loss": 0.5333, + "step": 7806, + "task_loss": 0.9715954661369324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9037359952926636, + "epoch": 6.6, + "learning_rate": 1.7003381234150466e-05, + "loss": 0.4514, + "step": 7807, + "task_loss": 1.1113052368164062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34200286865234375, + "epoch": 6.6, + "learning_rate": 1.6999154691462386e-05, + "loss": 0.4477, + "step": 7808, + "task_loss": 0.601847231388092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7178307771682739, + "epoch": 6.6, + "learning_rate": 1.6994928148774302e-05, + "loss": 0.4622, + "step": 7809, + "task_loss": 1.2305738925933838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49173447489738464, + "epoch": 6.6, + "learning_rate": 1.6990701606086222e-05, + "loss": 0.5046, + "step": 7810, + "task_loss": 0.3537293076515198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2590474486351013, + "epoch": 6.6, + "learning_rate": 1.698647506339814e-05, + "loss": 0.3872, + "step": 7811, + "task_loss": 0.9678168892860413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29836034774780273, + "epoch": 6.6, + "learning_rate": 1.6982248520710058e-05, + "loss": 0.4852, + "step": 7812, + "task_loss": 0.1088179275393486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2780569791793823, + "epoch": 6.6, + "learning_rate": 1.697802197802198e-05, + "loss": 0.442, + "step": 7813, + "task_loss": 0.3472524881362915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.438448965549469, + "epoch": 6.6, + "learning_rate": 1.6973795435333898e-05, + "loss": 0.423, + "step": 7814, + "task_loss": 0.25880756974220276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.411201536655426, + "epoch": 6.61, + "learning_rate": 1.6969568892645814e-05, + "loss": 0.4484, + "step": 7815, + "task_loss": 0.4490131139755249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26037681102752686, + "epoch": 6.61, + "learning_rate": 1.6965342349957737e-05, + "loss": 0.375, + "step": 7816, + "task_loss": 0.5130388736724854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.472415566444397, + "epoch": 6.61, + "learning_rate": 1.6961115807269653e-05, + "loss": 0.4606, + "step": 7817, + "task_loss": 1.0230894088745117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6055726408958435, + "epoch": 6.61, + "learning_rate": 1.6956889264581573e-05, + "loss": 0.5151, + "step": 7818, + "task_loss": 0.6419128775596619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5813463926315308, + "epoch": 6.61, + "learning_rate": 1.6952662721893493e-05, + "loss": 0.4788, + "step": 7819, + "task_loss": 0.9937012195587158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5667768716812134, + "epoch": 6.61, + "learning_rate": 1.694843617920541e-05, + "loss": 0.4229, + "step": 7820, + "task_loss": 1.1213327646255493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3345906138420105, + "epoch": 6.61, + "learning_rate": 1.694420963651733e-05, + "loss": 0.3897, + "step": 7821, + "task_loss": 1.0003970861434937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4654967188835144, + "epoch": 6.61, + "learning_rate": 1.693998309382925e-05, + "loss": 0.505, + "step": 7822, + "task_loss": 0.5672613382339478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39240339398384094, + "epoch": 6.61, + "learning_rate": 1.693575655114117e-05, + "loss": 0.5433, + "step": 7823, + "task_loss": 0.776161789894104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3743574023246765, + "epoch": 6.61, + "learning_rate": 1.6931530008453088e-05, + "loss": 0.5195, + "step": 7824, + "task_loss": 0.7787492275238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2515353858470917, + "epoch": 6.61, + "learning_rate": 1.6927303465765004e-05, + "loss": 0.2995, + "step": 7825, + "task_loss": 0.18031682074069977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5146732330322266, + "epoch": 6.61, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.4757, + "step": 7826, + "task_loss": 0.9696012139320374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5856581926345825, + "epoch": 6.62, + "learning_rate": 1.6918850380388844e-05, + "loss": 0.4807, + "step": 7827, + "task_loss": 1.7668358087539673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41958820819854736, + "epoch": 6.62, + "learning_rate": 1.691462383770076e-05, + "loss": 0.418, + "step": 7828, + "task_loss": 0.9741309881210327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31739336252212524, + "epoch": 6.62, + "learning_rate": 1.691039729501268e-05, + "loss": 0.4027, + "step": 7829, + "task_loss": 0.6754883527755737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27387967705726624, + "epoch": 6.62, + "learning_rate": 1.69061707523246e-05, + "loss": 0.6698, + "step": 7830, + "task_loss": 0.5085774660110474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4034919738769531, + "epoch": 6.62, + "learning_rate": 1.690194420963652e-05, + "loss": 0.4253, + "step": 7831, + "task_loss": 0.4528971314430237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30572280287742615, + "epoch": 6.62, + "learning_rate": 1.6897717666948436e-05, + "loss": 0.2868, + "step": 7832, + "task_loss": 0.13664884865283966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2029648870229721, + "epoch": 6.62, + "learning_rate": 1.6893491124260356e-05, + "loss": 0.4788, + "step": 7833, + "task_loss": 0.6416718363761902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.834963321685791, + "epoch": 6.62, + "learning_rate": 1.6889264581572275e-05, + "loss": 0.5773, + "step": 7834, + "task_loss": 0.9494382739067078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2479391247034073, + "epoch": 6.62, + "learning_rate": 1.688503803888419e-05, + "loss": 0.3864, + "step": 7835, + "task_loss": 0.5319033861160278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36878520250320435, + "epoch": 6.62, + "learning_rate": 1.688081149619611e-05, + "loss": 0.5437, + "step": 7836, + "task_loss": 1.075903058052063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3124607801437378, + "epoch": 6.62, + "learning_rate": 1.687658495350803e-05, + "loss": 0.4679, + "step": 7837, + "task_loss": 0.30980852246284485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3998446762561798, + "epoch": 6.63, + "learning_rate": 1.687235841081995e-05, + "loss": 0.416, + "step": 7838, + "task_loss": 1.0167372226715088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42109233140945435, + "epoch": 6.63, + "learning_rate": 1.686813186813187e-05, + "loss": 0.5737, + "step": 7839, + "task_loss": 1.477961540222168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4839189648628235, + "epoch": 6.63, + "learning_rate": 1.6863905325443787e-05, + "loss": 0.4945, + "step": 7840, + "task_loss": 1.487709879875183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2645489275455475, + "epoch": 6.63, + "learning_rate": 1.6859678782755707e-05, + "loss": 0.4852, + "step": 7841, + "task_loss": 0.009127922356128693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38056236505508423, + "epoch": 6.63, + "learning_rate": 1.6855452240067626e-05, + "loss": 0.4751, + "step": 7842, + "task_loss": 0.35602596402168274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3730393946170807, + "epoch": 6.63, + "learning_rate": 1.6851225697379543e-05, + "loss": 0.5508, + "step": 7843, + "task_loss": 0.657197117805481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3719111979007721, + "epoch": 6.63, + "learning_rate": 1.6846999154691463e-05, + "loss": 0.3195, + "step": 7844, + "task_loss": 0.4179593622684479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31686052680015564, + "epoch": 6.63, + "learning_rate": 1.6842772612003382e-05, + "loss": 0.4291, + "step": 7845, + "task_loss": 0.7979422807693481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6795693635940552, + "epoch": 6.63, + "learning_rate": 1.68385460693153e-05, + "loss": 0.5175, + "step": 7846, + "task_loss": 1.52130925655365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37797093391418457, + "epoch": 6.63, + "learning_rate": 1.6834319526627222e-05, + "loss": 0.4584, + "step": 7847, + "task_loss": 0.7377219796180725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21766868233680725, + "epoch": 6.63, + "learning_rate": 1.6830092983939138e-05, + "loss": 0.2589, + "step": 7848, + "task_loss": 0.33914804458618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2780611515045166, + "epoch": 6.63, + "learning_rate": 1.6825866441251058e-05, + "loss": 0.4986, + "step": 7849, + "task_loss": 0.39501118659973145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49253904819488525, + "epoch": 6.64, + "learning_rate": 1.6821639898562978e-05, + "loss": 0.386, + "step": 7850, + "task_loss": 0.9855445623397827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6423546671867371, + "epoch": 6.64, + "learning_rate": 1.6817413355874894e-05, + "loss": 0.4266, + "step": 7851, + "task_loss": 0.6134361624717712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.746596097946167, + "epoch": 6.64, + "learning_rate": 1.6813186813186814e-05, + "loss": 0.4749, + "step": 7852, + "task_loss": 0.6976813673973083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35743534564971924, + "epoch": 6.64, + "learning_rate": 1.6808960270498733e-05, + "loss": 0.4166, + "step": 7853, + "task_loss": 0.6467171907424927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2795352041721344, + "epoch": 6.64, + "learning_rate": 1.680473372781065e-05, + "loss": 0.5168, + "step": 7854, + "task_loss": 0.031177816912531853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6888682246208191, + "epoch": 6.64, + "learning_rate": 1.6800507185122573e-05, + "loss": 0.4106, + "step": 7855, + "task_loss": 0.8098357319831848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41857612133026123, + "epoch": 6.64, + "learning_rate": 1.679628064243449e-05, + "loss": 0.4888, + "step": 7856, + "task_loss": 0.6957071423530579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28110942244529724, + "epoch": 6.64, + "learning_rate": 1.6792054099746406e-05, + "loss": 0.3718, + "step": 7857, + "task_loss": 0.4127071797847748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38301414251327515, + "epoch": 6.64, + "learning_rate": 1.678782755705833e-05, + "loss": 0.3611, + "step": 7858, + "task_loss": 0.17276017367839813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44969305396080017, + "epoch": 6.64, + "learning_rate": 1.6783601014370245e-05, + "loss": 0.4666, + "step": 7859, + "task_loss": 0.8074188232421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29954028129577637, + "epoch": 6.64, + "learning_rate": 1.6779374471682165e-05, + "loss": 0.3689, + "step": 7860, + "task_loss": 0.6283911466598511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28224149346351624, + "epoch": 6.64, + "learning_rate": 1.6775147928994085e-05, + "loss": 0.3831, + "step": 7861, + "task_loss": 0.8562459349632263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2840210795402527, + "epoch": 6.65, + "learning_rate": 1.6770921386306e-05, + "loss": 0.3591, + "step": 7862, + "task_loss": 0.35653284192085266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3328137993812561, + "epoch": 6.65, + "learning_rate": 1.676669484361792e-05, + "loss": 0.4094, + "step": 7863, + "task_loss": 0.6902181506156921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27826401591300964, + "epoch": 6.65, + "learning_rate": 1.676246830092984e-05, + "loss": 0.482, + "step": 7864, + "task_loss": 0.2302171289920807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6427114605903625, + "epoch": 6.65, + "learning_rate": 1.6758241758241757e-05, + "loss": 0.5667, + "step": 7865, + "task_loss": 1.0369083881378174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.597230076789856, + "epoch": 6.65, + "learning_rate": 1.675401521555368e-05, + "loss": 0.5226, + "step": 7866, + "task_loss": 0.4389936625957489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22895793616771698, + "epoch": 6.65, + "learning_rate": 1.6749788672865596e-05, + "loss": 0.3498, + "step": 7867, + "task_loss": 0.23320049047470093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42503148317337036, + "epoch": 6.65, + "learning_rate": 1.6745562130177516e-05, + "loss": 0.492, + "step": 7868, + "task_loss": 0.36693093180656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22606365382671356, + "epoch": 6.65, + "learning_rate": 1.6741335587489436e-05, + "loss": 0.3632, + "step": 7869, + "task_loss": 0.40443316102027893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3249947428703308, + "epoch": 6.65, + "learning_rate": 1.6737109044801352e-05, + "loss": 0.4099, + "step": 7870, + "task_loss": 0.7696169018745422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9552532434463501, + "epoch": 6.65, + "learning_rate": 1.6732882502113272e-05, + "loss": 0.515, + "step": 7871, + "task_loss": 1.6129182577133179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23829714953899384, + "epoch": 6.65, + "learning_rate": 1.672865595942519e-05, + "loss": 0.3475, + "step": 7872, + "task_loss": 0.470314621925354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42258942127227783, + "epoch": 6.65, + "learning_rate": 1.6724429416737108e-05, + "loss": 0.384, + "step": 7873, + "task_loss": 0.7635220289230347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2941585183143616, + "epoch": 6.66, + "learning_rate": 1.6720202874049028e-05, + "loss": 0.3763, + "step": 7874, + "task_loss": 0.16405069828033447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3593031167984009, + "epoch": 6.66, + "learning_rate": 1.6715976331360947e-05, + "loss": 0.4803, + "step": 7875, + "task_loss": 0.9557135105133057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3731909394264221, + "epoch": 6.66, + "learning_rate": 1.6711749788672867e-05, + "loss": 0.4268, + "step": 7876, + "task_loss": 0.22565065324306488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2306399643421173, + "epoch": 6.66, + "learning_rate": 1.6707523245984787e-05, + "loss": 0.4265, + "step": 7877, + "task_loss": 0.7818352580070496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2936776876449585, + "epoch": 6.66, + "learning_rate": 1.6703296703296703e-05, + "loss": 0.299, + "step": 7878, + "task_loss": 0.40420129895210266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24466188251972198, + "epoch": 6.66, + "learning_rate": 1.6699070160608623e-05, + "loss": 0.3941, + "step": 7879, + "task_loss": 0.1965845227241516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3178272247314453, + "epoch": 6.66, + "learning_rate": 1.6694843617920543e-05, + "loss": 0.3404, + "step": 7880, + "task_loss": 0.19740581512451172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32702547311782837, + "epoch": 6.66, + "learning_rate": 1.6690617075232462e-05, + "loss": 0.3724, + "step": 7881, + "task_loss": 1.2774770259857178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4005640149116516, + "epoch": 6.66, + "learning_rate": 1.668639053254438e-05, + "loss": 0.4585, + "step": 7882, + "task_loss": 0.7200934290885925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4011356234550476, + "epoch": 6.66, + "learning_rate": 1.66821639898563e-05, + "loss": 0.4334, + "step": 7883, + "task_loss": 0.2082877904176712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5061812400817871, + "epoch": 6.66, + "learning_rate": 1.6677937447168218e-05, + "loss": 0.4661, + "step": 7884, + "task_loss": 0.7716788649559021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41362491250038147, + "epoch": 6.66, + "learning_rate": 1.6673710904480135e-05, + "loss": 0.4154, + "step": 7885, + "task_loss": 0.4621753990650177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.627677321434021, + "epoch": 6.67, + "learning_rate": 1.6669484361792054e-05, + "loss": 0.5421, + "step": 7886, + "task_loss": 0.7942931056022644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7152157425880432, + "epoch": 6.67, + "learning_rate": 1.6665257819103974e-05, + "loss": 0.5423, + "step": 7887, + "task_loss": 1.3514974117279053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24486316740512848, + "epoch": 6.67, + "learning_rate": 1.6661031276415894e-05, + "loss": 0.3862, + "step": 7888, + "task_loss": 0.4385841488838196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49971505999565125, + "epoch": 6.67, + "learning_rate": 1.6656804733727814e-05, + "loss": 0.4765, + "step": 7889, + "task_loss": 1.7225147485733032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3985351324081421, + "epoch": 6.67, + "learning_rate": 1.665257819103973e-05, + "loss": 0.3644, + "step": 7890, + "task_loss": 0.5793612003326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.217386856675148, + "epoch": 6.67, + "learning_rate": 1.664835164835165e-05, + "loss": 0.3296, + "step": 7891, + "task_loss": 0.04287043213844299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5360692739486694, + "epoch": 6.67, + "learning_rate": 1.664412510566357e-05, + "loss": 0.3718, + "step": 7892, + "task_loss": 0.4197237193584442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27151763439178467, + "epoch": 6.67, + "learning_rate": 1.6639898562975486e-05, + "loss": 0.422, + "step": 7893, + "task_loss": 0.4209800660610199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2818731367588043, + "epoch": 6.67, + "learning_rate": 1.6635672020287405e-05, + "loss": 0.4265, + "step": 7894, + "task_loss": 0.6921606659889221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4401148855686188, + "epoch": 6.67, + "learning_rate": 1.6631445477599325e-05, + "loss": 0.3912, + "step": 7895, + "task_loss": 0.24736544489860535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1674456000328064, + "epoch": 6.67, + "learning_rate": 1.662721893491124e-05, + "loss": 0.3296, + "step": 7896, + "task_loss": 0.5265105962753296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6385089159011841, + "epoch": 6.67, + "learning_rate": 1.6622992392223165e-05, + "loss": 0.433, + "step": 7897, + "task_loss": 1.0874245166778564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2909735441207886, + "epoch": 6.68, + "learning_rate": 1.661876584953508e-05, + "loss": 0.4436, + "step": 7898, + "task_loss": 0.3805181086063385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4249701201915741, + "epoch": 6.68, + "learning_rate": 1.6614539306846997e-05, + "loss": 0.377, + "step": 7899, + "task_loss": 0.5190927982330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4515044689178467, + "epoch": 6.68, + "learning_rate": 1.661031276415892e-05, + "loss": 0.5262, + "step": 7900, + "task_loss": 0.1972443014383316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4924963116645813, + "epoch": 6.68, + "learning_rate": 1.6606086221470837e-05, + "loss": 0.4453, + "step": 7901, + "task_loss": 1.2507990598678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4823368787765503, + "epoch": 6.68, + "learning_rate": 1.6601859678782757e-05, + "loss": 0.4629, + "step": 7902, + "task_loss": 0.6411387324333191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30803802609443665, + "epoch": 6.68, + "learning_rate": 1.6597633136094676e-05, + "loss": 0.3401, + "step": 7903, + "task_loss": 0.39228853583335876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36034291982650757, + "epoch": 6.68, + "learning_rate": 1.6593406593406593e-05, + "loss": 0.466, + "step": 7904, + "task_loss": 1.4728156328201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36487507820129395, + "epoch": 6.68, + "learning_rate": 1.6589180050718516e-05, + "loss": 0.4961, + "step": 7905, + "task_loss": 0.1345130205154419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7971124649047852, + "epoch": 6.68, + "learning_rate": 1.6584953508030432e-05, + "loss": 0.5819, + "step": 7906, + "task_loss": 0.5571340322494507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33632469177246094, + "epoch": 6.68, + "learning_rate": 1.658072696534235e-05, + "loss": 0.3748, + "step": 7907, + "task_loss": 0.39115187525749207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.922744870185852, + "epoch": 6.68, + "learning_rate": 1.657650042265427e-05, + "loss": 0.5942, + "step": 7908, + "task_loss": 0.861556351184845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15169547498226166, + "epoch": 6.69, + "learning_rate": 1.6572273879966188e-05, + "loss": 0.4049, + "step": 7909, + "task_loss": 0.019703006371855736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47481006383895874, + "epoch": 6.69, + "learning_rate": 1.6568047337278108e-05, + "loss": 0.3535, + "step": 7910, + "task_loss": 1.2536118030548096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3274134397506714, + "epoch": 6.69, + "learning_rate": 1.6563820794590027e-05, + "loss": 0.3646, + "step": 7911, + "task_loss": 0.10222628712654114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25910577178001404, + "epoch": 6.69, + "learning_rate": 1.6559594251901944e-05, + "loss": 0.4001, + "step": 7912, + "task_loss": 0.648970365524292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34479060769081116, + "epoch": 6.69, + "learning_rate": 1.6555367709213864e-05, + "loss": 0.439, + "step": 7913, + "task_loss": 0.6855186820030212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38624656200408936, + "epoch": 6.69, + "learning_rate": 1.6551141166525783e-05, + "loss": 0.4484, + "step": 7914, + "task_loss": 0.5227249264717102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6417444944381714, + "epoch": 6.69, + "learning_rate": 1.65469146238377e-05, + "loss": 0.5027, + "step": 7915, + "task_loss": 1.4856117963790894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3462070822715759, + "epoch": 6.69, + "learning_rate": 1.654268808114962e-05, + "loss": 0.3855, + "step": 7916, + "task_loss": 0.33778905868530273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34173285961151123, + "epoch": 6.69, + "learning_rate": 1.653846153846154e-05, + "loss": 0.3932, + "step": 7917, + "task_loss": 0.35714563727378845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21421535313129425, + "epoch": 6.69, + "learning_rate": 1.653423499577346e-05, + "loss": 0.404, + "step": 7918, + "task_loss": 0.03819049149751663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5638412237167358, + "epoch": 6.69, + "learning_rate": 1.653000845308538e-05, + "loss": 0.394, + "step": 7919, + "task_loss": 0.46492114663124084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4318890869617462, + "epoch": 6.69, + "learning_rate": 1.6525781910397295e-05, + "loss": 0.3902, + "step": 7920, + "task_loss": 0.3392890989780426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37183642387390137, + "epoch": 6.7, + "learning_rate": 1.6521555367709215e-05, + "loss": 0.4021, + "step": 7921, + "task_loss": 0.48992031812667847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5553531050682068, + "epoch": 6.7, + "learning_rate": 1.6517328825021134e-05, + "loss": 0.5349, + "step": 7922, + "task_loss": 0.3624607026576996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7443002462387085, + "epoch": 6.7, + "learning_rate": 1.651310228233305e-05, + "loss": 0.4903, + "step": 7923, + "task_loss": 0.5412360429763794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3718640208244324, + "epoch": 6.7, + "learning_rate": 1.650887573964497e-05, + "loss": 0.4557, + "step": 7924, + "task_loss": 0.5478312969207764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32824039459228516, + "epoch": 6.7, + "learning_rate": 1.650464919695689e-05, + "loss": 0.4565, + "step": 7925, + "task_loss": 0.8614242672920227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3667967915534973, + "epoch": 6.7, + "learning_rate": 1.650042265426881e-05, + "loss": 0.4552, + "step": 7926, + "task_loss": 0.2793425917625427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44447726011276245, + "epoch": 6.7, + "learning_rate": 1.6496196111580726e-05, + "loss": 0.3646, + "step": 7927, + "task_loss": 0.7600813508033752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6184817552566528, + "epoch": 6.7, + "learning_rate": 1.6491969568892646e-05, + "loss": 0.4924, + "step": 7928, + "task_loss": 0.9183597564697266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41231435537338257, + "epoch": 6.7, + "learning_rate": 1.6487743026204566e-05, + "loss": 0.4237, + "step": 7929, + "task_loss": 0.3910095691680908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6822634935379028, + "epoch": 6.7, + "learning_rate": 1.6483516483516486e-05, + "loss": 0.5559, + "step": 7930, + "task_loss": 0.8809622526168823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33036908507347107, + "epoch": 6.7, + "learning_rate": 1.6479289940828402e-05, + "loss": 0.5491, + "step": 7931, + "task_loss": 0.13775520026683807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3999181389808655, + "epoch": 6.7, + "learning_rate": 1.647506339814032e-05, + "loss": 0.4929, + "step": 7932, + "task_loss": 0.5354138612747192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.211920365691185, + "epoch": 6.71, + "learning_rate": 1.647083685545224e-05, + "loss": 0.3587, + "step": 7933, + "task_loss": 0.6876026391983032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40062692761421204, + "epoch": 6.71, + "learning_rate": 1.646661031276416e-05, + "loss": 0.4132, + "step": 7934, + "task_loss": 0.34042972326278687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5441950559616089, + "epoch": 6.71, + "learning_rate": 1.6462383770076077e-05, + "loss": 0.5045, + "step": 7935, + "task_loss": 0.839040994644165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37070974707603455, + "epoch": 6.71, + "learning_rate": 1.6458157227387997e-05, + "loss": 0.4938, + "step": 7936, + "task_loss": 0.6015868782997131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4295511245727539, + "epoch": 6.71, + "learning_rate": 1.6453930684699917e-05, + "loss": 0.3948, + "step": 7937, + "task_loss": 0.8989497423171997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3794359564781189, + "epoch": 6.71, + "learning_rate": 1.6449704142011833e-05, + "loss": 0.3314, + "step": 7938, + "task_loss": 0.2520560026168823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4793817400932312, + "epoch": 6.71, + "learning_rate": 1.6445477599323756e-05, + "loss": 0.3654, + "step": 7939, + "task_loss": 0.8180513381958008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4951170086860657, + "epoch": 6.71, + "learning_rate": 1.6441251056635673e-05, + "loss": 0.4887, + "step": 7940, + "task_loss": 0.9944536685943604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25072160363197327, + "epoch": 6.71, + "learning_rate": 1.6437024513947592e-05, + "loss": 0.5373, + "step": 7941, + "task_loss": 0.6939756870269775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24348419904708862, + "epoch": 6.71, + "learning_rate": 1.6432797971259512e-05, + "loss": 0.2321, + "step": 7942, + "task_loss": 0.40479740500450134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5575716495513916, + "epoch": 6.71, + "learning_rate": 1.642857142857143e-05, + "loss": 0.3591, + "step": 7943, + "task_loss": 0.5743822455406189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5072205662727356, + "epoch": 6.71, + "learning_rate": 1.642434488588335e-05, + "loss": 0.4241, + "step": 7944, + "task_loss": 0.405695378780365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2288137674331665, + "epoch": 6.72, + "learning_rate": 1.6420118343195268e-05, + "loss": 0.3291, + "step": 7945, + "task_loss": 1.074151873588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47672244906425476, + "epoch": 6.72, + "learning_rate": 1.6415891800507184e-05, + "loss": 0.4439, + "step": 7946, + "task_loss": 1.2125324010849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32044488191604614, + "epoch": 6.72, + "learning_rate": 1.6411665257819108e-05, + "loss": 0.465, + "step": 7947, + "task_loss": 0.4071246385574341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5282348394393921, + "epoch": 6.72, + "learning_rate": 1.6407438715131024e-05, + "loss": 0.4771, + "step": 7948, + "task_loss": 1.1675145626068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22832587361335754, + "epoch": 6.72, + "learning_rate": 1.640321217244294e-05, + "loss": 0.2561, + "step": 7949, + "task_loss": 0.44228026270866394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4511672854423523, + "epoch": 6.72, + "learning_rate": 1.6398985629754863e-05, + "loss": 0.4778, + "step": 7950, + "task_loss": 0.6656588912010193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6070488095283508, + "epoch": 6.72, + "learning_rate": 1.639475908706678e-05, + "loss": 0.4524, + "step": 7951, + "task_loss": 0.8925662040710449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45406103134155273, + "epoch": 6.72, + "learning_rate": 1.63905325443787e-05, + "loss": 0.3515, + "step": 7952, + "task_loss": 0.5755024552345276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38286399841308594, + "epoch": 6.72, + "learning_rate": 1.638630600169062e-05, + "loss": 0.5273, + "step": 7953, + "task_loss": 0.2970421612262726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22014494240283966, + "epoch": 6.72, + "learning_rate": 1.6382079459002536e-05, + "loss": 0.2855, + "step": 7954, + "task_loss": 0.5068712830543518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4310670495033264, + "epoch": 6.72, + "learning_rate": 1.6377852916314455e-05, + "loss": 0.5253, + "step": 7955, + "task_loss": 0.8217558860778809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4381455183029175, + "epoch": 6.72, + "learning_rate": 1.6373626373626375e-05, + "loss": 0.4016, + "step": 7956, + "task_loss": 0.6056846380233765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3161821663379669, + "epoch": 6.73, + "learning_rate": 1.636939983093829e-05, + "loss": 0.5172, + "step": 7957, + "task_loss": 0.260733962059021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44755077362060547, + "epoch": 6.73, + "learning_rate": 1.6365173288250214e-05, + "loss": 0.4121, + "step": 7958, + "task_loss": 0.9655861258506775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5423612594604492, + "epoch": 6.73, + "learning_rate": 1.636094674556213e-05, + "loss": 0.4104, + "step": 7959, + "task_loss": 0.8968042135238647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3530900478363037, + "epoch": 6.73, + "learning_rate": 1.6356720202874047e-05, + "loss": 0.412, + "step": 7960, + "task_loss": 0.18608194589614868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2978460192680359, + "epoch": 6.73, + "learning_rate": 1.635249366018597e-05, + "loss": 0.3943, + "step": 7961, + "task_loss": 0.2696540951728821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3914908170700073, + "epoch": 6.73, + "learning_rate": 1.6348267117497887e-05, + "loss": 0.5007, + "step": 7962, + "task_loss": 0.10750412940979004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.265880286693573, + "epoch": 6.73, + "learning_rate": 1.6344040574809806e-05, + "loss": 0.3117, + "step": 7963, + "task_loss": 0.16568820178508759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6397131681442261, + "epoch": 6.73, + "learning_rate": 1.6339814032121726e-05, + "loss": 0.3453, + "step": 7964, + "task_loss": 0.10658658295869827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2626704275608063, + "epoch": 6.73, + "learning_rate": 1.6335587489433643e-05, + "loss": 0.4288, + "step": 7965, + "task_loss": 0.2992420494556427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23224788904190063, + "epoch": 6.73, + "learning_rate": 1.6331360946745562e-05, + "loss": 0.4971, + "step": 7966, + "task_loss": 0.3319239914417267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4272986054420471, + "epoch": 6.73, + "learning_rate": 1.6327134404057482e-05, + "loss": 0.3209, + "step": 7967, + "task_loss": 0.5666282176971436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4463226795196533, + "epoch": 6.73, + "learning_rate": 1.6322907861369402e-05, + "loss": 0.514, + "step": 7968, + "task_loss": 1.488161325454712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5011719465255737, + "epoch": 6.74, + "learning_rate": 1.631868131868132e-05, + "loss": 0.4265, + "step": 7969, + "task_loss": 0.24966245889663696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5733376145362854, + "epoch": 6.74, + "learning_rate": 1.6314454775993238e-05, + "loss": 0.6238, + "step": 7970, + "task_loss": 1.1430233716964722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3056926131248474, + "epoch": 6.74, + "learning_rate": 1.6310228233305158e-05, + "loss": 0.5008, + "step": 7971, + "task_loss": 0.3297462463378906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3262958228588104, + "epoch": 6.74, + "learning_rate": 1.6306001690617077e-05, + "loss": 0.4637, + "step": 7972, + "task_loss": 0.16790935397148132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.347761869430542, + "epoch": 6.74, + "learning_rate": 1.6301775147928994e-05, + "loss": 0.3851, + "step": 7973, + "task_loss": 0.8909550309181213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4962056577205658, + "epoch": 6.74, + "learning_rate": 1.6297548605240913e-05, + "loss": 0.64, + "step": 7974, + "task_loss": 0.893773078918457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4327651858329773, + "epoch": 6.74, + "learning_rate": 1.6293322062552833e-05, + "loss": 0.5309, + "step": 7975, + "task_loss": 0.3724449574947357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2775087356567383, + "epoch": 6.74, + "learning_rate": 1.6289095519864753e-05, + "loss": 0.3464, + "step": 7976, + "task_loss": 0.48715224862098694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3297877013683319, + "epoch": 6.74, + "learning_rate": 1.628486897717667e-05, + "loss": 0.3777, + "step": 7977, + "task_loss": 0.24879519641399384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4638915956020355, + "epoch": 6.74, + "learning_rate": 1.628064243448859e-05, + "loss": 0.43, + "step": 7978, + "task_loss": 0.5176271200180054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3891712427139282, + "epoch": 6.74, + "learning_rate": 1.627641589180051e-05, + "loss": 0.5269, + "step": 7979, + "task_loss": 1.4875051975250244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28974100947380066, + "epoch": 6.75, + "learning_rate": 1.6272189349112425e-05, + "loss": 0.4091, + "step": 7980, + "task_loss": 0.6200781464576721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7219356298446655, + "epoch": 6.75, + "learning_rate": 1.6267962806424345e-05, + "loss": 0.4664, + "step": 7981, + "task_loss": 0.5566542744636536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4468679130077362, + "epoch": 6.75, + "learning_rate": 1.6263736263736265e-05, + "loss": 0.4221, + "step": 7982, + "task_loss": 0.8056275844573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36290279030799866, + "epoch": 6.75, + "learning_rate": 1.6259509721048184e-05, + "loss": 0.6668, + "step": 7983, + "task_loss": 1.2422292232513428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.317783385515213, + "epoch": 6.75, + "learning_rate": 1.6255283178360104e-05, + "loss": 0.4635, + "step": 7984, + "task_loss": 0.051301438361406326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39396974444389343, + "epoch": 6.75, + "learning_rate": 1.625105663567202e-05, + "loss": 0.4588, + "step": 7985, + "task_loss": 0.5450160503387451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32647043466567993, + "epoch": 6.75, + "learning_rate": 1.624683009298394e-05, + "loss": 0.3045, + "step": 7986, + "task_loss": 0.5163854956626892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3099084496498108, + "epoch": 6.75, + "learning_rate": 1.624260355029586e-05, + "loss": 0.4896, + "step": 7987, + "task_loss": 0.47284674644470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5949761867523193, + "epoch": 6.75, + "learning_rate": 1.6238377007607776e-05, + "loss": 0.5508, + "step": 7988, + "task_loss": 0.40143948793411255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35647398233413696, + "epoch": 6.75, + "learning_rate": 1.6234150464919696e-05, + "loss": 0.4673, + "step": 7989, + "task_loss": 0.5432431101799011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5430876016616821, + "epoch": 6.75, + "learning_rate": 1.6229923922231616e-05, + "loss": 0.4804, + "step": 7990, + "task_loss": 0.8444298505783081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.650983452796936, + "epoch": 6.75, + "learning_rate": 1.6225697379543532e-05, + "loss": 0.4648, + "step": 7991, + "task_loss": 0.6564168930053711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40401947498321533, + "epoch": 6.76, + "learning_rate": 1.6221470836855455e-05, + "loss": 0.3705, + "step": 7992, + "task_loss": 0.5064387917518616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3301240801811218, + "epoch": 6.76, + "learning_rate": 1.621724429416737e-05, + "loss": 0.3703, + "step": 7993, + "task_loss": 0.191017284989357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16784796118736267, + "epoch": 6.76, + "learning_rate": 1.621301775147929e-05, + "loss": 0.4085, + "step": 7994, + "task_loss": 0.013024200685322285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5615407824516296, + "epoch": 6.76, + "learning_rate": 1.620879120879121e-05, + "loss": 0.4057, + "step": 7995, + "task_loss": 1.4824144840240479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6682569980621338, + "epoch": 6.76, + "learning_rate": 1.6204564666103127e-05, + "loss": 0.4554, + "step": 7996, + "task_loss": 1.2234739065170288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5098133087158203, + "epoch": 6.76, + "learning_rate": 1.6200338123415047e-05, + "loss": 0.4248, + "step": 7997, + "task_loss": 1.1086411476135254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49236127734184265, + "epoch": 6.76, + "learning_rate": 1.6196111580726967e-05, + "loss": 0.3804, + "step": 7998, + "task_loss": 1.1716437339782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4926813840866089, + "epoch": 6.76, + "learning_rate": 1.6191885038038883e-05, + "loss": 0.4639, + "step": 7999, + "task_loss": 0.4787040650844574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2839115858078003, + "epoch": 6.76, + "learning_rate": 1.6187658495350806e-05, + "loss": 0.4997, + "step": 8000, + "task_loss": 0.6261990070343018 + }, + { + "epoch": 6.76, + "eval_accuracy": 0.9125544554455446, + "eval_loss": 0.27955999970436096, + "eval_runtime": 228.9141, + "eval_samples_per_second": 110.303, + "eval_steps_per_second": 0.865, + "step": 8000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16095927357673645, + "epoch": 6.76, + "learning_rate": 1.6183431952662723e-05, + "loss": 0.4743, + "step": 8001, + "task_loss": 0.2690582871437073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5012394189834595, + "epoch": 6.76, + "learning_rate": 1.617920540997464e-05, + "loss": 0.3123, + "step": 8002, + "task_loss": 0.49093863368034363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6563464999198914, + "epoch": 6.76, + "learning_rate": 1.6174978867286562e-05, + "loss": 0.5459, + "step": 8003, + "task_loss": 1.0457136631011963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3518804907798767, + "epoch": 6.77, + "learning_rate": 1.617075232459848e-05, + "loss": 0.3818, + "step": 8004, + "task_loss": 0.3805527985095978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6097347140312195, + "epoch": 6.77, + "learning_rate": 1.6166525781910398e-05, + "loss": 0.4862, + "step": 8005, + "task_loss": 1.503118634223938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38980960845947266, + "epoch": 6.77, + "learning_rate": 1.6162299239222318e-05, + "loss": 0.4604, + "step": 8006, + "task_loss": 0.9634042382240295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4452405273914337, + "epoch": 6.77, + "learning_rate": 1.6158072696534234e-05, + "loss": 0.4147, + "step": 8007, + "task_loss": 0.622454822063446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8142214417457581, + "epoch": 6.77, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.4728, + "step": 8008, + "task_loss": 0.5377688407897949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40532761812210083, + "epoch": 6.77, + "learning_rate": 1.6149619611158074e-05, + "loss": 0.4213, + "step": 8009, + "task_loss": 0.4275454580783844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3079216778278351, + "epoch": 6.77, + "learning_rate": 1.614539306846999e-05, + "loss": 0.4219, + "step": 8010, + "task_loss": 0.34865784645080566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27649450302124023, + "epoch": 6.77, + "learning_rate": 1.6141166525781913e-05, + "loss": 0.3931, + "step": 8011, + "task_loss": 0.581843912601471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36940401792526245, + "epoch": 6.77, + "learning_rate": 1.613693998309383e-05, + "loss": 0.4218, + "step": 8012, + "task_loss": 1.0136339664459229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29815709590911865, + "epoch": 6.77, + "learning_rate": 1.613271344040575e-05, + "loss": 0.5031, + "step": 8013, + "task_loss": 0.7317314743995667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4165448546409607, + "epoch": 6.77, + "learning_rate": 1.612848689771767e-05, + "loss": 0.3401, + "step": 8014, + "task_loss": 0.8661158084869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2309708148241043, + "epoch": 6.77, + "learning_rate": 1.6124260355029585e-05, + "loss": 0.4438, + "step": 8015, + "task_loss": 0.30182480812072754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7776057720184326, + "epoch": 6.78, + "learning_rate": 1.6120033812341505e-05, + "loss": 0.4979, + "step": 8016, + "task_loss": 0.6237890720367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34678593277931213, + "epoch": 6.78, + "learning_rate": 1.6115807269653425e-05, + "loss": 0.4933, + "step": 8017, + "task_loss": 0.5657974481582642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44178688526153564, + "epoch": 6.78, + "learning_rate": 1.611158072696534e-05, + "loss": 0.5281, + "step": 8018, + "task_loss": 1.1523361206054688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5925604701042175, + "epoch": 6.78, + "learning_rate": 1.610735418427726e-05, + "loss": 0.4988, + "step": 8019, + "task_loss": 0.328058123588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4390653967857361, + "epoch": 6.78, + "learning_rate": 1.610312764158918e-05, + "loss": 0.4739, + "step": 8020, + "task_loss": 0.5127101540565491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5726560354232788, + "epoch": 6.78, + "learning_rate": 1.60989010989011e-05, + "loss": 0.5856, + "step": 8021, + "task_loss": 0.8196535110473633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5633814930915833, + "epoch": 6.78, + "learning_rate": 1.609467455621302e-05, + "loss": 0.4737, + "step": 8022, + "task_loss": 0.36884334683418274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3535408675670624, + "epoch": 6.78, + "learning_rate": 1.6090448013524937e-05, + "loss": 0.3469, + "step": 8023, + "task_loss": 0.10423684865236282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16628949344158173, + "epoch": 6.78, + "learning_rate": 1.6086221470836856e-05, + "loss": 0.3359, + "step": 8024, + "task_loss": 0.5117090344429016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29161396622657776, + "epoch": 6.78, + "learning_rate": 1.6081994928148776e-05, + "loss": 0.3325, + "step": 8025, + "task_loss": 0.6183074116706848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.390272319316864, + "epoch": 6.78, + "learning_rate": 1.6077768385460696e-05, + "loss": 0.3471, + "step": 8026, + "task_loss": 0.5201712846755981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5231207609176636, + "epoch": 6.78, + "learning_rate": 1.6073541842772612e-05, + "loss": 0.4771, + "step": 8027, + "task_loss": 0.2206430435180664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35198789834976196, + "epoch": 6.79, + "learning_rate": 1.6069315300084532e-05, + "loss": 0.4537, + "step": 8028, + "task_loss": 0.5128129720687866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33014366030693054, + "epoch": 6.79, + "learning_rate": 1.606508875739645e-05, + "loss": 0.3205, + "step": 8029, + "task_loss": 0.8191583752632141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3232291340827942, + "epoch": 6.79, + "learning_rate": 1.6060862214708368e-05, + "loss": 0.3346, + "step": 8030, + "task_loss": 0.35558173060417175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3934454917907715, + "epoch": 6.79, + "learning_rate": 1.6056635672020288e-05, + "loss": 0.4103, + "step": 8031, + "task_loss": 0.4295128881931305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21254229545593262, + "epoch": 6.79, + "learning_rate": 1.6052409129332207e-05, + "loss": 0.3439, + "step": 8032, + "task_loss": 0.6510332226753235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23942747712135315, + "epoch": 6.79, + "learning_rate": 1.6048182586644127e-05, + "loss": 0.3563, + "step": 8033, + "task_loss": 0.34563490748405457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19090262055397034, + "epoch": 6.79, + "learning_rate": 1.6043956043956047e-05, + "loss": 0.4454, + "step": 8034, + "task_loss": 0.09281444549560547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2765709161758423, + "epoch": 6.79, + "learning_rate": 1.6039729501267963e-05, + "loss": 0.3416, + "step": 8035, + "task_loss": 0.5890077948570251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4097260534763336, + "epoch": 6.79, + "learning_rate": 1.6035502958579883e-05, + "loss": 0.4477, + "step": 8036, + "task_loss": 0.6353673934936523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3127579391002655, + "epoch": 6.79, + "learning_rate": 1.6031276415891803e-05, + "loss": 0.469, + "step": 8037, + "task_loss": 0.47293365001678467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5280681848526001, + "epoch": 6.79, + "learning_rate": 1.602704987320372e-05, + "loss": 0.5216, + "step": 8038, + "task_loss": 0.42518875002861023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34957584738731384, + "epoch": 6.79, + "learning_rate": 1.602282333051564e-05, + "loss": 0.4564, + "step": 8039, + "task_loss": 0.7410932183265686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4208882451057434, + "epoch": 6.8, + "learning_rate": 1.601859678782756e-05, + "loss": 0.4007, + "step": 8040, + "task_loss": 0.530224621295929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43443965911865234, + "epoch": 6.8, + "learning_rate": 1.6014370245139475e-05, + "loss": 0.4637, + "step": 8041, + "task_loss": 0.21614809334278107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3379107713699341, + "epoch": 6.8, + "learning_rate": 1.6010143702451398e-05, + "loss": 0.4029, + "step": 8042, + "task_loss": 0.35711783170700073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19834917783737183, + "epoch": 6.8, + "learning_rate": 1.6005917159763314e-05, + "loss": 0.3515, + "step": 8043, + "task_loss": 0.10279317200183868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23508857190608978, + "epoch": 6.8, + "learning_rate": 1.600169061707523e-05, + "loss": 0.3821, + "step": 8044, + "task_loss": 0.4397507607936859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3264158368110657, + "epoch": 6.8, + "learning_rate": 1.5997464074387154e-05, + "loss": 0.3823, + "step": 8045, + "task_loss": 0.7231268286705017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48114579916000366, + "epoch": 6.8, + "learning_rate": 1.599323753169907e-05, + "loss": 0.4898, + "step": 8046, + "task_loss": 0.8730953335762024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.412562757730484, + "epoch": 6.8, + "learning_rate": 1.598901098901099e-05, + "loss": 0.3786, + "step": 8047, + "task_loss": 0.20533229410648346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37530413269996643, + "epoch": 6.8, + "learning_rate": 1.598478444632291e-05, + "loss": 0.4339, + "step": 8048, + "task_loss": 0.11649879068136215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3089049458503723, + "epoch": 6.8, + "learning_rate": 1.5980557903634826e-05, + "loss": 0.3362, + "step": 8049, + "task_loss": 0.043812721967697144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3435078263282776, + "epoch": 6.8, + "learning_rate": 1.5976331360946746e-05, + "loss": 0.4923, + "step": 8050, + "task_loss": 0.7447821497917175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3593711853027344, + "epoch": 6.81, + "learning_rate": 1.5972104818258665e-05, + "loss": 0.3428, + "step": 8051, + "task_loss": 0.29679423570632935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6199342012405396, + "epoch": 6.81, + "learning_rate": 1.5967878275570582e-05, + "loss": 0.5188, + "step": 8052, + "task_loss": 1.025733232498169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4931357800960541, + "epoch": 6.81, + "learning_rate": 1.5963651732882505e-05, + "loss": 0.3856, + "step": 8053, + "task_loss": 0.11635943502187729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.296481192111969, + "epoch": 6.81, + "learning_rate": 1.595942519019442e-05, + "loss": 0.3528, + "step": 8054, + "task_loss": 0.45406460762023926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3673345148563385, + "epoch": 6.81, + "learning_rate": 1.5955198647506338e-05, + "loss": 0.3677, + "step": 8055, + "task_loss": 0.7397918701171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3369954824447632, + "epoch": 6.81, + "learning_rate": 1.595097210481826e-05, + "loss": 0.3799, + "step": 8056, + "task_loss": 0.7255717515945435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4376756548881531, + "epoch": 6.81, + "learning_rate": 1.5946745562130177e-05, + "loss": 0.56, + "step": 8057, + "task_loss": 1.481479287147522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3143744468688965, + "epoch": 6.81, + "learning_rate": 1.5942519019442097e-05, + "loss": 0.3216, + "step": 8058, + "task_loss": 0.8532222509384155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3906935155391693, + "epoch": 6.81, + "learning_rate": 1.5938292476754017e-05, + "loss": 0.513, + "step": 8059, + "task_loss": 0.2701754868030548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5425615310668945, + "epoch": 6.81, + "learning_rate": 1.5934065934065933e-05, + "loss": 0.5498, + "step": 8060, + "task_loss": 0.2165064811706543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32677197456359863, + "epoch": 6.81, + "learning_rate": 1.5929839391377853e-05, + "loss": 0.3415, + "step": 8061, + "task_loss": 0.22302232682704926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4614410996437073, + "epoch": 6.81, + "learning_rate": 1.5925612848689772e-05, + "loss": 0.4994, + "step": 8062, + "task_loss": 0.48615002632141113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35435038805007935, + "epoch": 6.82, + "learning_rate": 1.5921386306001692e-05, + "loss": 0.4701, + "step": 8063, + "task_loss": 0.747526228427887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3174985349178314, + "epoch": 6.82, + "learning_rate": 1.5917159763313612e-05, + "loss": 0.3389, + "step": 8064, + "task_loss": 0.32501181960105896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28691577911376953, + "epoch": 6.82, + "learning_rate": 1.5912933220625528e-05, + "loss": 0.3381, + "step": 8065, + "task_loss": 0.2907180190086365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42154568433761597, + "epoch": 6.82, + "learning_rate": 1.5908706677937448e-05, + "loss": 0.361, + "step": 8066, + "task_loss": 0.3193168342113495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3747527599334717, + "epoch": 6.82, + "learning_rate": 1.5904480135249368e-05, + "loss": 0.4731, + "step": 8067, + "task_loss": 1.0634963512420654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4335249662399292, + "epoch": 6.82, + "learning_rate": 1.5900253592561284e-05, + "loss": 0.502, + "step": 8068, + "task_loss": 1.1207228899002075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3604487180709839, + "epoch": 6.82, + "learning_rate": 1.5896027049873204e-05, + "loss": 0.4875, + "step": 8069, + "task_loss": 0.6643565893173218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6618340015411377, + "epoch": 6.82, + "learning_rate": 1.5891800507185124e-05, + "loss": 0.5111, + "step": 8070, + "task_loss": 0.42158302664756775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8793841600418091, + "epoch": 6.82, + "learning_rate": 1.5887573964497043e-05, + "loss": 0.5906, + "step": 8071, + "task_loss": 1.0742409229278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4195050597190857, + "epoch": 6.82, + "learning_rate": 1.588334742180896e-05, + "loss": 0.3787, + "step": 8072, + "task_loss": 0.4573048949241638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5253006219863892, + "epoch": 6.82, + "learning_rate": 1.587912087912088e-05, + "loss": 0.3946, + "step": 8073, + "task_loss": 1.1194708347320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5496973395347595, + "epoch": 6.82, + "learning_rate": 1.58748943364328e-05, + "loss": 0.462, + "step": 8074, + "task_loss": 1.2671682834625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17610062658786774, + "epoch": 6.83, + "learning_rate": 1.587066779374472e-05, + "loss": 0.3822, + "step": 8075, + "task_loss": 0.14079108834266663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31717604398727417, + "epoch": 6.83, + "learning_rate": 1.5866441251056635e-05, + "loss": 0.331, + "step": 8076, + "task_loss": 0.35425427556037903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3246527314186096, + "epoch": 6.83, + "learning_rate": 1.5862214708368555e-05, + "loss": 0.4595, + "step": 8077, + "task_loss": 0.34041333198547363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21272632479667664, + "epoch": 6.83, + "learning_rate": 1.5857988165680475e-05, + "loss": 0.3478, + "step": 8078, + "task_loss": 0.20563143491744995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48189061880111694, + "epoch": 6.83, + "learning_rate": 1.5853761622992394e-05, + "loss": 0.3814, + "step": 8079, + "task_loss": 0.7871041297912598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40245136618614197, + "epoch": 6.83, + "learning_rate": 1.584953508030431e-05, + "loss": 0.3914, + "step": 8080, + "task_loss": 1.0009925365447998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24773475527763367, + "epoch": 6.83, + "learning_rate": 1.584530853761623e-05, + "loss": 0.4022, + "step": 8081, + "task_loss": 0.5443968772888184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2241898626089096, + "epoch": 6.83, + "learning_rate": 1.584108199492815e-05, + "loss": 0.4694, + "step": 8082, + "task_loss": 0.055436864495277405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39965224266052246, + "epoch": 6.83, + "learning_rate": 1.5836855452240067e-05, + "loss": 0.3547, + "step": 8083, + "task_loss": 1.2731119394302368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3657926619052887, + "epoch": 6.83, + "learning_rate": 1.5832628909551986e-05, + "loss": 0.4419, + "step": 8084, + "task_loss": 1.1267801523208618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5794357061386108, + "epoch": 6.83, + "learning_rate": 1.5828402366863906e-05, + "loss": 0.5081, + "step": 8085, + "task_loss": 0.4853847622871399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42030853033065796, + "epoch": 6.83, + "learning_rate": 1.5824175824175826e-05, + "loss": 0.4069, + "step": 8086, + "task_loss": 0.2855437397956848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.527397871017456, + "epoch": 6.84, + "learning_rate": 1.5819949281487746e-05, + "loss": 0.5237, + "step": 8087, + "task_loss": 0.38615819811820984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.194346085190773, + "epoch": 6.84, + "learning_rate": 1.5815722738799662e-05, + "loss": 0.3106, + "step": 8088, + "task_loss": 0.2588060796260834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4728699326515198, + "epoch": 6.84, + "learning_rate": 1.581149619611158e-05, + "loss": 0.3948, + "step": 8089, + "task_loss": 0.4079889953136444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20930883288383484, + "epoch": 6.84, + "learning_rate": 1.58072696534235e-05, + "loss": 0.3449, + "step": 8090, + "task_loss": 0.3875212073326111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3761861324310303, + "epoch": 6.84, + "learning_rate": 1.5803043110735418e-05, + "loss": 0.3651, + "step": 8091, + "task_loss": 0.6081957817077637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34621936082839966, + "epoch": 6.84, + "learning_rate": 1.579881656804734e-05, + "loss": 0.4026, + "step": 8092, + "task_loss": 0.9961043000221252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5572715997695923, + "epoch": 6.84, + "learning_rate": 1.5794590025359257e-05, + "loss": 0.3983, + "step": 8093, + "task_loss": 1.1120784282684326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14668989181518555, + "epoch": 6.84, + "learning_rate": 1.5790363482671174e-05, + "loss": 0.386, + "step": 8094, + "task_loss": 0.07613242417573929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33486998081207275, + "epoch": 6.84, + "learning_rate": 1.5786136939983097e-05, + "loss": 0.3491, + "step": 8095, + "task_loss": 0.37807804346084595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5577993988990784, + "epoch": 6.84, + "learning_rate": 1.5781910397295013e-05, + "loss": 0.3665, + "step": 8096, + "task_loss": 0.5054947733879089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37743669748306274, + "epoch": 6.84, + "learning_rate": 1.577768385460693e-05, + "loss": 0.4887, + "step": 8097, + "task_loss": 0.568747341632843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3052997589111328, + "epoch": 6.84, + "learning_rate": 1.5773457311918853e-05, + "loss": 0.3749, + "step": 8098, + "task_loss": 0.5515596270561218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23653361201286316, + "epoch": 6.85, + "learning_rate": 1.576923076923077e-05, + "loss": 0.3166, + "step": 8099, + "task_loss": 0.20517557859420776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43176761269569397, + "epoch": 6.85, + "learning_rate": 1.576500422654269e-05, + "loss": 0.4632, + "step": 8100, + "task_loss": 0.2481747716665268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31201624870300293, + "epoch": 6.85, + "learning_rate": 1.576077768385461e-05, + "loss": 0.5004, + "step": 8101, + "task_loss": 0.9064463973045349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3761563003063202, + "epoch": 6.85, + "learning_rate": 1.5756551141166525e-05, + "loss": 0.4795, + "step": 8102, + "task_loss": 0.09617482125759125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33675050735473633, + "epoch": 6.85, + "learning_rate": 1.5752324598478448e-05, + "loss": 0.5245, + "step": 8103, + "task_loss": 0.49845263361930847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2771129012107849, + "epoch": 6.85, + "learning_rate": 1.5748098055790364e-05, + "loss": 0.3804, + "step": 8104, + "task_loss": 0.3021808862686157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29359641671180725, + "epoch": 6.85, + "learning_rate": 1.574387151310228e-05, + "loss": 0.3325, + "step": 8105, + "task_loss": 0.5776968002319336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3664187788963318, + "epoch": 6.85, + "learning_rate": 1.5739644970414204e-05, + "loss": 0.3538, + "step": 8106, + "task_loss": 0.8243472576141357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3749851584434509, + "epoch": 6.85, + "learning_rate": 1.573541842772612e-05, + "loss": 0.3746, + "step": 8107, + "task_loss": 2.871325731277466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30693602561950684, + "epoch": 6.85, + "learning_rate": 1.573119188503804e-05, + "loss": 0.3447, + "step": 8108, + "task_loss": 0.6457070708274841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5174764394760132, + "epoch": 6.85, + "learning_rate": 1.572696534234996e-05, + "loss": 0.479, + "step": 8109, + "task_loss": 0.482393741607666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30791014432907104, + "epoch": 6.85, + "learning_rate": 1.5722738799661876e-05, + "loss": 0.4761, + "step": 8110, + "task_loss": 0.7153225541114807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4221128225326538, + "epoch": 6.86, + "learning_rate": 1.5718512256973796e-05, + "loss": 0.4195, + "step": 8111, + "task_loss": 0.7556651830673218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17395806312561035, + "epoch": 6.86, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.3017, + "step": 8112, + "task_loss": 0.25805196166038513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33352524042129517, + "epoch": 6.86, + "learning_rate": 1.571005917159763e-05, + "loss": 0.4653, + "step": 8113, + "task_loss": 0.5006245374679565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2153080403804779, + "epoch": 6.86, + "learning_rate": 1.570583262890955e-05, + "loss": 0.4075, + "step": 8114, + "task_loss": 0.722095251083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5717830657958984, + "epoch": 6.86, + "learning_rate": 1.570160608622147e-05, + "loss": 0.4074, + "step": 8115, + "task_loss": 0.9062063694000244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5659950971603394, + "epoch": 6.86, + "learning_rate": 1.569737954353339e-05, + "loss": 0.467, + "step": 8116, + "task_loss": 0.21606139838695526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37353789806365967, + "epoch": 6.86, + "learning_rate": 1.569315300084531e-05, + "loss": 0.4096, + "step": 8117, + "task_loss": 0.9258955717086792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2989078462123871, + "epoch": 6.86, + "learning_rate": 1.5688926458157227e-05, + "loss": 0.2604, + "step": 8118, + "task_loss": 1.034909725189209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4253457188606262, + "epoch": 6.86, + "learning_rate": 1.5684699915469147e-05, + "loss": 0.4764, + "step": 8119, + "task_loss": 1.0334888696670532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23852607607841492, + "epoch": 6.86, + "learning_rate": 1.5680473372781066e-05, + "loss": 0.5042, + "step": 8120, + "task_loss": 0.6262539029121399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47986072301864624, + "epoch": 6.86, + "learning_rate": 1.5676246830092986e-05, + "loss": 0.4565, + "step": 8121, + "task_loss": 0.4495536684989929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32538288831710815, + "epoch": 6.87, + "learning_rate": 1.5672020287404903e-05, + "loss": 0.4453, + "step": 8122, + "task_loss": 0.6178637742996216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43225064873695374, + "epoch": 6.87, + "learning_rate": 1.5667793744716822e-05, + "loss": 0.3901, + "step": 8123, + "task_loss": 0.3126216530799866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6894912719726562, + "epoch": 6.87, + "learning_rate": 1.5663567202028742e-05, + "loss": 0.4547, + "step": 8124, + "task_loss": 0.9569483399391174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42321085929870605, + "epoch": 6.87, + "learning_rate": 1.565934065934066e-05, + "loss": 0.4053, + "step": 8125, + "task_loss": 0.7773823738098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2936991751194, + "epoch": 6.87, + "learning_rate": 1.5655114116652578e-05, + "loss": 0.4166, + "step": 8126, + "task_loss": 0.8166089057922363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4077661335468292, + "epoch": 6.87, + "learning_rate": 1.5650887573964498e-05, + "loss": 0.5046, + "step": 8127, + "task_loss": 0.6761220693588257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31363582611083984, + "epoch": 6.87, + "learning_rate": 1.5646661031276418e-05, + "loss": 0.4506, + "step": 8128, + "task_loss": 0.21208417415618896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5668113231658936, + "epoch": 6.87, + "learning_rate": 1.5642434488588337e-05, + "loss": 0.4882, + "step": 8129, + "task_loss": 0.5586158037185669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4881879687309265, + "epoch": 6.87, + "learning_rate": 1.5638207945900254e-05, + "loss": 0.4445, + "step": 8130, + "task_loss": 0.04391130805015564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2599678337574005, + "epoch": 6.87, + "learning_rate": 1.5633981403212173e-05, + "loss": 0.3197, + "step": 8131, + "task_loss": 0.2690867483615875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4973011016845703, + "epoch": 6.87, + "learning_rate": 1.5629754860524093e-05, + "loss": 0.5662, + "step": 8132, + "task_loss": 0.9081259369850159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5619357228279114, + "epoch": 6.87, + "learning_rate": 1.562552831783601e-05, + "loss": 0.4176, + "step": 8133, + "task_loss": 0.23953431844711304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40733802318573, + "epoch": 6.88, + "learning_rate": 1.562130177514793e-05, + "loss": 0.3983, + "step": 8134, + "task_loss": 0.8377662897109985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4572140574455261, + "epoch": 6.88, + "learning_rate": 1.561707523245985e-05, + "loss": 0.5337, + "step": 8135, + "task_loss": 0.643834114074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3703729510307312, + "epoch": 6.88, + "learning_rate": 1.5612848689771765e-05, + "loss": 0.393, + "step": 8136, + "task_loss": 0.4513920247554779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6426957845687866, + "epoch": 6.88, + "learning_rate": 1.560862214708369e-05, + "loss": 0.5496, + "step": 8137, + "task_loss": 0.5394741296768188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.55413818359375, + "epoch": 6.88, + "learning_rate": 1.5604395604395605e-05, + "loss": 0.4153, + "step": 8138, + "task_loss": 0.5221492052078247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24530254304409027, + "epoch": 6.88, + "learning_rate": 1.5600169061707525e-05, + "loss": 0.5809, + "step": 8139, + "task_loss": 0.34876152873039246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4040549695491791, + "epoch": 6.88, + "learning_rate": 1.5595942519019444e-05, + "loss": 0.3158, + "step": 8140, + "task_loss": 0.4428268373012543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27233025431632996, + "epoch": 6.88, + "learning_rate": 1.559171597633136e-05, + "loss": 0.3513, + "step": 8141, + "task_loss": 0.12480652332305908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23360180854797363, + "epoch": 6.88, + "learning_rate": 1.558748943364328e-05, + "loss": 0.4417, + "step": 8142, + "task_loss": 0.5513081550598145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5700194835662842, + "epoch": 6.88, + "learning_rate": 1.55832628909552e-05, + "loss": 0.5125, + "step": 8143, + "task_loss": 0.5107743740081787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30233046412467957, + "epoch": 6.88, + "learning_rate": 1.5579036348267116e-05, + "loss": 0.3763, + "step": 8144, + "task_loss": 0.6201931238174438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4131520390510559, + "epoch": 6.88, + "learning_rate": 1.557480980557904e-05, + "loss": 0.3749, + "step": 8145, + "task_loss": 0.5864682793617249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4278145432472229, + "epoch": 6.89, + "learning_rate": 1.5570583262890956e-05, + "loss": 0.4052, + "step": 8146, + "task_loss": 0.6012652516365051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5053829550743103, + "epoch": 6.89, + "learning_rate": 1.5566356720202872e-05, + "loss": 0.3906, + "step": 8147, + "task_loss": 0.4206555187702179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2978106737136841, + "epoch": 6.89, + "learning_rate": 1.5562130177514795e-05, + "loss": 0.4727, + "step": 8148, + "task_loss": 0.41058239340782166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3008463382720947, + "epoch": 6.89, + "learning_rate": 1.5557903634826712e-05, + "loss": 0.445, + "step": 8149, + "task_loss": 0.2962489724159241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6150112152099609, + "epoch": 6.89, + "learning_rate": 1.555367709213863e-05, + "loss": 0.5465, + "step": 8150, + "task_loss": 0.7760774493217468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46302351355552673, + "epoch": 6.89, + "learning_rate": 1.554945054945055e-05, + "loss": 0.4248, + "step": 8151, + "task_loss": 1.3289170265197754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28753024339675903, + "epoch": 6.89, + "learning_rate": 1.5545224006762468e-05, + "loss": 0.4847, + "step": 8152, + "task_loss": 0.47131258249282837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4746450185775757, + "epoch": 6.89, + "learning_rate": 1.5540997464074387e-05, + "loss": 0.4887, + "step": 8153, + "task_loss": 0.19067645072937012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26616308093070984, + "epoch": 6.89, + "learning_rate": 1.5536770921386307e-05, + "loss": 0.3738, + "step": 8154, + "task_loss": 0.5526065826416016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3078230321407318, + "epoch": 6.89, + "learning_rate": 1.5532544378698223e-05, + "loss": 0.3285, + "step": 8155, + "task_loss": 0.2108541578054428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5634310841560364, + "epoch": 6.89, + "learning_rate": 1.5528317836010147e-05, + "loss": 0.4527, + "step": 8156, + "task_loss": 1.3418488502502441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3835914731025696, + "epoch": 6.89, + "learning_rate": 1.5524091293322063e-05, + "loss": 0.524, + "step": 8157, + "task_loss": 1.2033747434616089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4126585125923157, + "epoch": 6.9, + "learning_rate": 1.5519864750633983e-05, + "loss": 0.3997, + "step": 8158, + "task_loss": 0.8263439536094666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3593077063560486, + "epoch": 6.9, + "learning_rate": 1.5515638207945902e-05, + "loss": 0.4915, + "step": 8159, + "task_loss": 0.3045428991317749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21504604816436768, + "epoch": 6.9, + "learning_rate": 1.551141166525782e-05, + "loss": 0.3788, + "step": 8160, + "task_loss": 0.271843284368515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3327184021472931, + "epoch": 6.9, + "learning_rate": 1.550718512256974e-05, + "loss": 0.4636, + "step": 8161, + "task_loss": 0.08891170471906662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41336962580680847, + "epoch": 6.9, + "learning_rate": 1.5502958579881658e-05, + "loss": 0.3772, + "step": 8162, + "task_loss": 0.47042813897132874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22254504263401031, + "epoch": 6.9, + "learning_rate": 1.5498732037193575e-05, + "loss": 0.4226, + "step": 8163, + "task_loss": 1.122079610824585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3511051833629608, + "epoch": 6.9, + "learning_rate": 1.5494505494505494e-05, + "loss": 0.4026, + "step": 8164, + "task_loss": 0.8661563396453857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31127703189849854, + "epoch": 6.9, + "learning_rate": 1.5490278951817414e-05, + "loss": 0.256, + "step": 8165, + "task_loss": 0.08318139612674713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32370615005493164, + "epoch": 6.9, + "learning_rate": 1.5486052409129334e-05, + "loss": 0.4869, + "step": 8166, + "task_loss": 1.0473151206970215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3904055058956146, + "epoch": 6.9, + "learning_rate": 1.5481825866441254e-05, + "loss": 0.4573, + "step": 8167, + "task_loss": 0.535521388053894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49605295062065125, + "epoch": 6.9, + "learning_rate": 1.547759932375317e-05, + "loss": 0.4706, + "step": 8168, + "task_loss": 0.9088897705078125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3962818682193756, + "epoch": 6.9, + "learning_rate": 1.547337278106509e-05, + "loss": 0.416, + "step": 8169, + "task_loss": 0.7820472121238708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31070685386657715, + "epoch": 6.91, + "learning_rate": 1.546914623837701e-05, + "loss": 0.3688, + "step": 8170, + "task_loss": 0.4734131097793579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5645719170570374, + "epoch": 6.91, + "learning_rate": 1.5464919695688926e-05, + "loss": 0.5806, + "step": 8171, + "task_loss": 0.4287545084953308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4783346951007843, + "epoch": 6.91, + "learning_rate": 1.5460693153000845e-05, + "loss": 0.4706, + "step": 8172, + "task_loss": 0.47039905190467834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36929216980934143, + "epoch": 6.91, + "learning_rate": 1.5456466610312765e-05, + "loss": 0.3397, + "step": 8173, + "task_loss": 1.265995979309082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6413061618804932, + "epoch": 6.91, + "learning_rate": 1.5452240067624685e-05, + "loss": 0.5159, + "step": 8174, + "task_loss": 1.1344407796859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9467734098434448, + "epoch": 6.91, + "learning_rate": 1.54480135249366e-05, + "loss": 0.4928, + "step": 8175, + "task_loss": 1.5185370445251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5044880509376526, + "epoch": 6.91, + "learning_rate": 1.544378698224852e-05, + "loss": 0.41, + "step": 8176, + "task_loss": 0.31301987171173096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5570403337478638, + "epoch": 6.91, + "learning_rate": 1.543956043956044e-05, + "loss": 0.5241, + "step": 8177, + "task_loss": 1.453322410583496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19055254757404327, + "epoch": 6.91, + "learning_rate": 1.5435333896872357e-05, + "loss": 0.3988, + "step": 8178, + "task_loss": 0.3592352867126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18470153212547302, + "epoch": 6.91, + "learning_rate": 1.543110735418428e-05, + "loss": 0.4151, + "step": 8179, + "task_loss": 0.13249756395816803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3627029359340668, + "epoch": 6.91, + "learning_rate": 1.5426880811496197e-05, + "loss": 0.3906, + "step": 8180, + "task_loss": 0.699699878692627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3549533486366272, + "epoch": 6.91, + "learning_rate": 1.5422654268808116e-05, + "loss": 0.4659, + "step": 8181, + "task_loss": 0.49538061022758484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46586260199546814, + "epoch": 6.92, + "learning_rate": 1.5418427726120036e-05, + "loss": 0.3822, + "step": 8182, + "task_loss": 0.8771148324012756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6045873165130615, + "epoch": 6.92, + "learning_rate": 1.5414201183431952e-05, + "loss": 0.4479, + "step": 8183, + "task_loss": 1.226340651512146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4289664626121521, + "epoch": 6.92, + "learning_rate": 1.5409974640743872e-05, + "loss": 0.443, + "step": 8184, + "task_loss": 1.225841760635376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5048332810401917, + "epoch": 6.92, + "learning_rate": 1.5405748098055792e-05, + "loss": 0.3656, + "step": 8185, + "task_loss": 0.11877667158842087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.431664377450943, + "epoch": 6.92, + "learning_rate": 1.5401521555367708e-05, + "loss": 0.4556, + "step": 8186, + "task_loss": 1.0210868120193481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3943973183631897, + "epoch": 6.92, + "learning_rate": 1.539729501267963e-05, + "loss": 0.4322, + "step": 8187, + "task_loss": 0.6154343485832214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4577901363372803, + "epoch": 6.92, + "learning_rate": 1.5393068469991548e-05, + "loss": 0.3572, + "step": 8188, + "task_loss": 0.6505711674690247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3881218135356903, + "epoch": 6.92, + "learning_rate": 1.5388841927303464e-05, + "loss": 0.4488, + "step": 8189, + "task_loss": 0.6387543082237244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3238261044025421, + "epoch": 6.92, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.321, + "step": 8190, + "task_loss": 0.42165616154670715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4473949670791626, + "epoch": 6.92, + "learning_rate": 1.5380388841927304e-05, + "loss": 0.3374, + "step": 8191, + "task_loss": 0.3603472411632538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6827565431594849, + "epoch": 6.92, + "learning_rate": 1.5376162299239223e-05, + "loss": 0.4922, + "step": 8192, + "task_loss": 0.6123996376991272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30786436796188354, + "epoch": 6.93, + "learning_rate": 1.5371935756551143e-05, + "loss": 0.3905, + "step": 8193, + "task_loss": 0.6191640496253967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3981778621673584, + "epoch": 6.93, + "learning_rate": 1.536770921386306e-05, + "loss": 0.4501, + "step": 8194, + "task_loss": 0.2218705713748932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5682381391525269, + "epoch": 6.93, + "learning_rate": 1.536348267117498e-05, + "loss": 0.4223, + "step": 8195, + "task_loss": 0.29919493198394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4306694269180298, + "epoch": 6.93, + "learning_rate": 1.53592561284869e-05, + "loss": 0.4843, + "step": 8196, + "task_loss": 0.7966792583465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3472772240638733, + "epoch": 6.93, + "learning_rate": 1.5355029585798815e-05, + "loss": 0.3953, + "step": 8197, + "task_loss": 0.6032739877700806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31867820024490356, + "epoch": 6.93, + "learning_rate": 1.535080304311074e-05, + "loss": 0.4119, + "step": 8198, + "task_loss": 0.3969121277332306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21640893816947937, + "epoch": 6.93, + "learning_rate": 1.5346576500422655e-05, + "loss": 0.3359, + "step": 8199, + "task_loss": 1.3615880012512207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2863558530807495, + "epoch": 6.93, + "learning_rate": 1.534234995773457e-05, + "loss": 0.3792, + "step": 8200, + "task_loss": 0.1618276834487915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2689869701862335, + "epoch": 6.93, + "learning_rate": 1.5338123415046494e-05, + "loss": 0.5147, + "step": 8201, + "task_loss": 0.7231017351150513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23083674907684326, + "epoch": 6.93, + "learning_rate": 1.533389687235841e-05, + "loss": 0.3249, + "step": 8202, + "task_loss": 0.9172165393829346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3891570568084717, + "epoch": 6.93, + "learning_rate": 1.532967032967033e-05, + "loss": 0.3953, + "step": 8203, + "task_loss": 0.7521423101425171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22845682501792908, + "epoch": 6.93, + "learning_rate": 1.532544378698225e-05, + "loss": 0.3412, + "step": 8204, + "task_loss": 0.1563791185617447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21759039163589478, + "epoch": 6.94, + "learning_rate": 1.5321217244294166e-05, + "loss": 0.2823, + "step": 8205, + "task_loss": 0.5944491624832153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15138694643974304, + "epoch": 6.94, + "learning_rate": 1.5316990701606086e-05, + "loss": 0.3447, + "step": 8206, + "task_loss": 0.8165296912193298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26458975672721863, + "epoch": 6.94, + "learning_rate": 1.5312764158918006e-05, + "loss": 0.3479, + "step": 8207, + "task_loss": 0.2845596373081207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1431186944246292, + "epoch": 6.94, + "learning_rate": 1.5308537616229926e-05, + "loss": 0.3306, + "step": 8208, + "task_loss": 0.009467852301895618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3068787753582001, + "epoch": 6.94, + "learning_rate": 1.5304311073541845e-05, + "loss": 0.4213, + "step": 8209, + "task_loss": 0.8935056924819946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5556225180625916, + "epoch": 6.94, + "learning_rate": 1.530008453085376e-05, + "loss": 0.4376, + "step": 8210, + "task_loss": 0.3257346749305725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45078691840171814, + "epoch": 6.94, + "learning_rate": 1.529585798816568e-05, + "loss": 0.4185, + "step": 8211, + "task_loss": 0.7217759490013123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1918867826461792, + "epoch": 6.94, + "learning_rate": 1.52916314454776e-05, + "loss": 0.4319, + "step": 8212, + "task_loss": 0.37170982360839844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33981025218963623, + "epoch": 6.94, + "learning_rate": 1.5287404902789517e-05, + "loss": 0.4704, + "step": 8213, + "task_loss": 0.2498490959405899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3119122385978699, + "epoch": 6.94, + "learning_rate": 1.5283178360101437e-05, + "loss": 0.3156, + "step": 8214, + "task_loss": 0.2671097218990326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7428929805755615, + "epoch": 6.94, + "learning_rate": 1.5278951817413357e-05, + "loss": 0.5126, + "step": 8215, + "task_loss": 1.3264228105545044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3727587163448334, + "epoch": 6.94, + "learning_rate": 1.5274725274725277e-05, + "loss": 0.2992, + "step": 8216, + "task_loss": 0.3081343173980713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41894060373306274, + "epoch": 6.95, + "learning_rate": 1.5270498732037193e-05, + "loss": 0.4916, + "step": 8217, + "task_loss": 1.698621392250061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3773888349533081, + "epoch": 6.95, + "learning_rate": 1.5266272189349113e-05, + "loss": 0.5325, + "step": 8218, + "task_loss": 1.126094102859497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24135497212409973, + "epoch": 6.95, + "learning_rate": 1.5262045646661032e-05, + "loss": 0.36, + "step": 8219, + "task_loss": 0.14775274693965912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5234148502349854, + "epoch": 6.95, + "learning_rate": 1.525781910397295e-05, + "loss": 0.5619, + "step": 8220, + "task_loss": 0.8358763456344604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6000182628631592, + "epoch": 6.95, + "learning_rate": 1.5253592561284869e-05, + "loss": 0.4682, + "step": 8221, + "task_loss": 1.2217031717300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3514356315135956, + "epoch": 6.95, + "learning_rate": 1.5249366018596788e-05, + "loss": 0.4394, + "step": 8222, + "task_loss": 0.9565135836601257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5196155309677124, + "epoch": 6.95, + "learning_rate": 1.5245139475908706e-05, + "loss": 0.4129, + "step": 8223, + "task_loss": 0.6463860869407654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1928631067276001, + "epoch": 6.95, + "learning_rate": 1.5240912933220628e-05, + "loss": 0.4404, + "step": 8224, + "task_loss": 0.23237508535385132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44084781408309937, + "epoch": 6.95, + "learning_rate": 1.5236686390532546e-05, + "loss": 0.427, + "step": 8225, + "task_loss": 0.04840904846787453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4103754758834839, + "epoch": 6.95, + "learning_rate": 1.5232459847844462e-05, + "loss": 0.4236, + "step": 8226, + "task_loss": 0.3935781419277191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5178766846656799, + "epoch": 6.95, + "learning_rate": 1.5228233305156384e-05, + "loss": 0.5063, + "step": 8227, + "task_loss": 0.8814098834991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.258184015750885, + "epoch": 6.95, + "learning_rate": 1.5224006762468302e-05, + "loss": 0.2491, + "step": 8228, + "task_loss": 0.09820559620857239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36516836285591125, + "epoch": 6.96, + "learning_rate": 1.521978021978022e-05, + "loss": 0.4404, + "step": 8229, + "task_loss": 0.17960189282894135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5361000895500183, + "epoch": 6.96, + "learning_rate": 1.521555367709214e-05, + "loss": 0.6251, + "step": 8230, + "task_loss": 0.46445783972740173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5012030601501465, + "epoch": 6.96, + "learning_rate": 1.5211327134404057e-05, + "loss": 0.3734, + "step": 8231, + "task_loss": 1.2340478897094727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2361038774251938, + "epoch": 6.96, + "learning_rate": 1.5207100591715979e-05, + "loss": 0.3507, + "step": 8232, + "task_loss": 0.4812510311603546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31439924240112305, + "epoch": 6.96, + "learning_rate": 1.5202874049027895e-05, + "loss": 0.3039, + "step": 8233, + "task_loss": 0.708295464515686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6341074705123901, + "epoch": 6.96, + "learning_rate": 1.5198647506339813e-05, + "loss": 0.484, + "step": 8234, + "task_loss": 1.037630558013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6677741408348083, + "epoch": 6.96, + "learning_rate": 1.5194420963651735e-05, + "loss": 0.5158, + "step": 8235, + "task_loss": 0.7357756495475769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18401606380939484, + "epoch": 6.96, + "learning_rate": 1.5190194420963653e-05, + "loss": 0.3939, + "step": 8236, + "task_loss": 0.5122273564338684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42348361015319824, + "epoch": 6.96, + "learning_rate": 1.5185967878275573e-05, + "loss": 0.3968, + "step": 8237, + "task_loss": 0.9195177555084229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47977378964424133, + "epoch": 6.96, + "learning_rate": 1.518174133558749e-05, + "loss": 0.4024, + "step": 8238, + "task_loss": 0.5926809906959534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4956817626953125, + "epoch": 6.96, + "learning_rate": 1.5177514792899409e-05, + "loss": 0.4297, + "step": 8239, + "task_loss": 0.5607907772064209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.454342782497406, + "epoch": 6.96, + "learning_rate": 1.5173288250211328e-05, + "loss": 0.4335, + "step": 8240, + "task_loss": 0.32166731357574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6324416399002075, + "epoch": 6.97, + "learning_rate": 1.5169061707523246e-05, + "loss": 0.4456, + "step": 8241, + "task_loss": 0.3131512701511383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2104112207889557, + "epoch": 6.97, + "learning_rate": 1.5164835164835164e-05, + "loss": 0.2718, + "step": 8242, + "task_loss": 0.14100228250026703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27196985483169556, + "epoch": 6.97, + "learning_rate": 1.5160608622147084e-05, + "loss": 0.4041, + "step": 8243, + "task_loss": 0.32193055748939514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32567691802978516, + "epoch": 6.97, + "learning_rate": 1.5156382079459002e-05, + "loss": 0.3799, + "step": 8244, + "task_loss": 0.7688029408454895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31179162859916687, + "epoch": 6.97, + "learning_rate": 1.5152155536770924e-05, + "loss": 0.3678, + "step": 8245, + "task_loss": 0.6757123470306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32105210423469543, + "epoch": 6.97, + "learning_rate": 1.5147928994082842e-05, + "loss": 0.4709, + "step": 8246, + "task_loss": 0.4627591669559479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36061960458755493, + "epoch": 6.97, + "learning_rate": 1.514370245139476e-05, + "loss": 0.3984, + "step": 8247, + "task_loss": 0.561049222946167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3668675422668457, + "epoch": 6.97, + "learning_rate": 1.513947590870668e-05, + "loss": 0.4749, + "step": 8248, + "task_loss": 0.8205997347831726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29102611541748047, + "epoch": 6.97, + "learning_rate": 1.5135249366018598e-05, + "loss": 0.3436, + "step": 8249, + "task_loss": 0.5333042144775391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7841709852218628, + "epoch": 6.97, + "learning_rate": 1.5131022823330516e-05, + "loss": 0.5745, + "step": 8250, + "task_loss": 1.2410589456558228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3313364088535309, + "epoch": 6.97, + "learning_rate": 1.5126796280642435e-05, + "loss": 0.391, + "step": 8251, + "task_loss": 0.29525822401046753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2859940826892853, + "epoch": 6.97, + "learning_rate": 1.5122569737954353e-05, + "loss": 0.453, + "step": 8252, + "task_loss": 1.2679792642593384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29265299439430237, + "epoch": 6.98, + "learning_rate": 1.5118343195266275e-05, + "loss": 0.332, + "step": 8253, + "task_loss": 0.7017331719398499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2722637355327606, + "epoch": 6.98, + "learning_rate": 1.5114116652578191e-05, + "loss": 0.4774, + "step": 8254, + "task_loss": 0.4859996438026428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4201558232307434, + "epoch": 6.98, + "learning_rate": 1.510989010989011e-05, + "loss": 0.4082, + "step": 8255, + "task_loss": 0.4891192317008972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36740803718566895, + "epoch": 6.98, + "learning_rate": 1.510566356720203e-05, + "loss": 0.3569, + "step": 8256, + "task_loss": 0.5057170391082764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3489178717136383, + "epoch": 6.98, + "learning_rate": 1.5101437024513949e-05, + "loss": 0.3125, + "step": 8257, + "task_loss": 0.30894824862480164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25977516174316406, + "epoch": 6.98, + "learning_rate": 1.5097210481825865e-05, + "loss": 0.3352, + "step": 8258, + "task_loss": 0.7753267884254456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4740365743637085, + "epoch": 6.98, + "learning_rate": 1.5092983939137786e-05, + "loss": 0.5811, + "step": 8259, + "task_loss": 1.9040555953979492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33003661036491394, + "epoch": 6.98, + "learning_rate": 1.5088757396449705e-05, + "loss": 0.4145, + "step": 8260, + "task_loss": 0.4096348285675049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3292182981967926, + "epoch": 6.98, + "learning_rate": 1.5084530853761624e-05, + "loss": 0.471, + "step": 8261, + "task_loss": 0.49542826414108276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5234651565551758, + "epoch": 6.98, + "learning_rate": 1.5080304311073542e-05, + "loss": 0.512, + "step": 8262, + "task_loss": 0.5969032049179077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36772823333740234, + "epoch": 6.98, + "learning_rate": 1.507607776838546e-05, + "loss": 0.3241, + "step": 8263, + "task_loss": 0.08594870567321777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7351909875869751, + "epoch": 6.99, + "learning_rate": 1.5071851225697382e-05, + "loss": 0.5296, + "step": 8264, + "task_loss": 0.7577266693115234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3588067591190338, + "epoch": 6.99, + "learning_rate": 1.5067624683009298e-05, + "loss": 0.4477, + "step": 8265, + "task_loss": 1.1385146379470825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3751617670059204, + "epoch": 6.99, + "learning_rate": 1.506339814032122e-05, + "loss": 0.4245, + "step": 8266, + "task_loss": 0.3227076232433319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4811413884162903, + "epoch": 6.99, + "learning_rate": 1.5059171597633138e-05, + "loss": 0.4525, + "step": 8267, + "task_loss": 1.201701045036316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5206922292709351, + "epoch": 6.99, + "learning_rate": 1.5054945054945056e-05, + "loss": 0.4455, + "step": 8268, + "task_loss": 1.453420639038086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1632324755191803, + "epoch": 6.99, + "learning_rate": 1.5050718512256975e-05, + "loss": 0.3331, + "step": 8269, + "task_loss": 0.027013765648007393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5707831382751465, + "epoch": 6.99, + "learning_rate": 1.5046491969568893e-05, + "loss": 0.4053, + "step": 8270, + "task_loss": 0.11289359629154205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4111563563346863, + "epoch": 6.99, + "learning_rate": 1.5042265426880811e-05, + "loss": 0.3912, + "step": 8271, + "task_loss": 0.9436833262443542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7225178480148315, + "epoch": 6.99, + "learning_rate": 1.5038038884192731e-05, + "loss": 0.465, + "step": 8272, + "task_loss": 0.5543030500411987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3046078085899353, + "epoch": 6.99, + "learning_rate": 1.503381234150465e-05, + "loss": 0.3233, + "step": 8273, + "task_loss": 0.2614608108997345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3342648148536682, + "epoch": 6.99, + "learning_rate": 1.502958579881657e-05, + "loss": 0.408, + "step": 8274, + "task_loss": 0.34398412704467773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3740657567977905, + "epoch": 6.99, + "learning_rate": 1.5025359256128487e-05, + "loss": 0.387, + "step": 8275, + "task_loss": 0.8366837501525879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38045695424079895, + "epoch": 7.0, + "learning_rate": 1.5021132713440405e-05, + "loss": 0.4089, + "step": 8276, + "task_loss": 1.1091407537460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2700023353099823, + "epoch": 7.0, + "learning_rate": 1.5016906170752327e-05, + "loss": 0.4341, + "step": 8277, + "task_loss": 0.7705312967300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5198190212249756, + "epoch": 7.0, + "learning_rate": 1.5012679628064245e-05, + "loss": 0.4332, + "step": 8278, + "task_loss": 0.9903658032417297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24078455567359924, + "epoch": 7.0, + "learning_rate": 1.5008453085376163e-05, + "loss": 0.3135, + "step": 8279, + "task_loss": 0.14397670328617096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7521635293960571, + "epoch": 7.0, + "learning_rate": 1.5004226542688082e-05, + "loss": 0.4739, + "step": 8280, + "task_loss": 0.29760488867759705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28123408555984497, + "epoch": 7.0, + "learning_rate": 1.5e-05, + "loss": 0.4135, + "step": 8281, + "task_loss": 0.2527936100959778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4413542151451111, + "epoch": 7.0, + "learning_rate": 1.499577345731192e-05, + "loss": 0.6427, + "step": 8282, + "task_loss": 0.5521907210350037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46478211879730225, + "epoch": 7.0, + "learning_rate": 1.4991546914623838e-05, + "loss": 0.3155, + "step": 8283, + "task_loss": 0.31751516461372375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2643257677555084, + "epoch": 7.0, + "learning_rate": 1.4987320371935756e-05, + "loss": 0.3724, + "step": 8284, + "task_loss": 0.4382242262363434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39968597888946533, + "epoch": 7.0, + "learning_rate": 1.4983093829247678e-05, + "loss": 0.4576, + "step": 8285, + "task_loss": 0.6146531701087952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35490909218788147, + "epoch": 7.0, + "learning_rate": 1.4978867286559594e-05, + "loss": 0.4603, + "step": 8286, + "task_loss": 0.6384093165397644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36409473419189453, + "epoch": 7.01, + "learning_rate": 1.4974640743871512e-05, + "loss": 0.3614, + "step": 8287, + "task_loss": 0.720887303352356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45239415764808655, + "epoch": 7.01, + "learning_rate": 1.4970414201183433e-05, + "loss": 0.4816, + "step": 8288, + "task_loss": 0.9364504814147949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5099371671676636, + "epoch": 7.01, + "learning_rate": 1.4966187658495352e-05, + "loss": 0.4192, + "step": 8289, + "task_loss": 0.6691038012504578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36551812291145325, + "epoch": 7.01, + "learning_rate": 1.4961961115807271e-05, + "loss": 0.3389, + "step": 8290, + "task_loss": 0.7291792035102844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.574684739112854, + "epoch": 7.01, + "learning_rate": 1.495773457311919e-05, + "loss": 0.4111, + "step": 8291, + "task_loss": 0.5037595629692078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2524784803390503, + "epoch": 7.01, + "learning_rate": 1.4953508030431107e-05, + "loss": 0.4496, + "step": 8292, + "task_loss": 0.6400421261787415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23396465182304382, + "epoch": 7.01, + "learning_rate": 1.4949281487743027e-05, + "loss": 0.2952, + "step": 8293, + "task_loss": 0.20518313348293304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3309219181537628, + "epoch": 7.01, + "learning_rate": 1.4945054945054945e-05, + "loss": 0.377, + "step": 8294, + "task_loss": 1.1114895343780518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4212479889392853, + "epoch": 7.01, + "learning_rate": 1.4940828402366867e-05, + "loss": 0.4089, + "step": 8295, + "task_loss": 0.11287232488393784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24975335597991943, + "epoch": 7.01, + "learning_rate": 1.4936601859678785e-05, + "loss": 0.408, + "step": 8296, + "task_loss": 0.24343259632587433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45902740955352783, + "epoch": 7.01, + "learning_rate": 1.4932375316990701e-05, + "loss": 0.4378, + "step": 8297, + "task_loss": 0.9787672162055969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26421287655830383, + "epoch": 7.01, + "learning_rate": 1.4928148774302622e-05, + "loss": 0.3334, + "step": 8298, + "task_loss": 0.190673828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41736918687820435, + "epoch": 7.02, + "learning_rate": 1.492392223161454e-05, + "loss": 0.4539, + "step": 8299, + "task_loss": 0.8382971882820129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5746397972106934, + "epoch": 7.02, + "learning_rate": 1.4919695688926458e-05, + "loss": 0.4608, + "step": 8300, + "task_loss": 0.9622389078140259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3626786470413208, + "epoch": 7.02, + "learning_rate": 1.4915469146238378e-05, + "loss": 0.4559, + "step": 8301, + "task_loss": 0.8607772588729858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8122441172599792, + "epoch": 7.02, + "learning_rate": 1.4911242603550296e-05, + "loss": 0.4078, + "step": 8302, + "task_loss": 0.15627838671207428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6141949892044067, + "epoch": 7.02, + "learning_rate": 1.4907016060862216e-05, + "loss": 0.475, + "step": 8303, + "task_loss": 1.0471584796905518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2582205533981323, + "epoch": 7.02, + "learning_rate": 1.4902789518174134e-05, + "loss": 0.4039, + "step": 8304, + "task_loss": 1.020057201385498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 1.1612155437469482, + "epoch": 7.02, + "learning_rate": 1.4898562975486052e-05, + "loss": 0.6594, + "step": 8305, + "task_loss": 1.2823190689086914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45852434635162354, + "epoch": 7.02, + "learning_rate": 1.4894336432797974e-05, + "loss": 0.4157, + "step": 8306, + "task_loss": 0.6477389335632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2611725330352783, + "epoch": 7.02, + "learning_rate": 1.489010989010989e-05, + "loss": 0.4112, + "step": 8307, + "task_loss": 0.11670973151922226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2922886610031128, + "epoch": 7.02, + "learning_rate": 1.4885883347421808e-05, + "loss": 0.4607, + "step": 8308, + "task_loss": 0.5235911011695862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20869910717010498, + "epoch": 7.02, + "learning_rate": 1.488165680473373e-05, + "loss": 0.3471, + "step": 8309, + "task_loss": 0.1230420172214508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4804826080799103, + "epoch": 7.02, + "learning_rate": 1.4877430262045647e-05, + "loss": 0.3813, + "step": 8310, + "task_loss": 0.37991058826446533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2190600484609604, + "epoch": 7.03, + "learning_rate": 1.4873203719357567e-05, + "loss": 0.3699, + "step": 8311, + "task_loss": 0.8183318376541138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5728894472122192, + "epoch": 7.03, + "learning_rate": 1.4868977176669485e-05, + "loss": 0.4008, + "step": 8312, + "task_loss": 1.2204796075820923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49801915884017944, + "epoch": 7.03, + "learning_rate": 1.4864750633981403e-05, + "loss": 0.5616, + "step": 8313, + "task_loss": 1.0254170894622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4623652994632721, + "epoch": 7.03, + "learning_rate": 1.4860524091293323e-05, + "loss": 0.3879, + "step": 8314, + "task_loss": 1.3507338762283325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3845575153827667, + "epoch": 7.03, + "learning_rate": 1.4856297548605241e-05, + "loss": 0.5037, + "step": 8315, + "task_loss": 1.0278867483139038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3353651165962219, + "epoch": 7.03, + "learning_rate": 1.4852071005917159e-05, + "loss": 0.3757, + "step": 8316, + "task_loss": 0.23556576669216156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34871047735214233, + "epoch": 7.03, + "learning_rate": 1.484784446322908e-05, + "loss": 0.4148, + "step": 8317, + "task_loss": 0.43021389842033386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22020882368087769, + "epoch": 7.03, + "learning_rate": 1.4843617920540997e-05, + "loss": 0.2869, + "step": 8318, + "task_loss": 0.2829124927520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3685554265975952, + "epoch": 7.03, + "learning_rate": 1.4839391377852918e-05, + "loss": 0.3395, + "step": 8319, + "task_loss": 0.4097379744052887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3138637840747833, + "epoch": 7.03, + "learning_rate": 1.4835164835164836e-05, + "loss": 0.4665, + "step": 8320, + "task_loss": 0.4797903597354889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4466157853603363, + "epoch": 7.03, + "learning_rate": 1.4830938292476754e-05, + "loss": 0.4033, + "step": 8321, + "task_loss": 0.6313538551330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49122291803359985, + "epoch": 7.03, + "learning_rate": 1.4826711749788674e-05, + "loss": 0.4366, + "step": 8322, + "task_loss": 0.5350803732872009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6044434309005737, + "epoch": 7.04, + "learning_rate": 1.4822485207100592e-05, + "loss": 0.5513, + "step": 8323, + "task_loss": 0.7601437568664551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1923898309469223, + "epoch": 7.04, + "learning_rate": 1.4818258664412512e-05, + "loss": 0.3461, + "step": 8324, + "task_loss": 0.09433046728372574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5911145210266113, + "epoch": 7.04, + "learning_rate": 1.481403212172443e-05, + "loss": 0.5336, + "step": 8325, + "task_loss": 0.2179163098335266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2999769151210785, + "epoch": 7.04, + "learning_rate": 1.4809805579036348e-05, + "loss": 0.3369, + "step": 8326, + "task_loss": 0.18105952441692352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32190385460853577, + "epoch": 7.04, + "learning_rate": 1.480557903634827e-05, + "loss": 0.4078, + "step": 8327, + "task_loss": 0.7043678164482117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38484692573547363, + "epoch": 7.04, + "learning_rate": 1.4801352493660187e-05, + "loss": 0.4162, + "step": 8328, + "task_loss": 0.25875455141067505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3129620850086212, + "epoch": 7.04, + "learning_rate": 1.4797125950972104e-05, + "loss": 0.4042, + "step": 8329, + "task_loss": 0.20342200994491577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.556844174861908, + "epoch": 7.04, + "learning_rate": 1.4792899408284025e-05, + "loss": 0.3933, + "step": 8330, + "task_loss": 0.3725518584251404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3196388781070709, + "epoch": 7.04, + "learning_rate": 1.4788672865595943e-05, + "loss": 0.4744, + "step": 8331, + "task_loss": 0.9997777342796326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33940571546554565, + "epoch": 7.04, + "learning_rate": 1.4784446322907863e-05, + "loss": 0.4123, + "step": 8332, + "task_loss": 0.48851415514945984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31779640913009644, + "epoch": 7.04, + "learning_rate": 1.4780219780219781e-05, + "loss": 0.3807, + "step": 8333, + "task_loss": 0.24944418668746948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4143342971801758, + "epoch": 7.04, + "learning_rate": 1.4775993237531699e-05, + "loss": 0.3972, + "step": 8334, + "task_loss": 0.5147993564605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39723363518714905, + "epoch": 7.05, + "learning_rate": 1.4771766694843619e-05, + "loss": 0.4624, + "step": 8335, + "task_loss": 1.3776476383209229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5259736776351929, + "epoch": 7.05, + "learning_rate": 1.4767540152155537e-05, + "loss": 0.4428, + "step": 8336, + "task_loss": 1.361332654953003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5901424884796143, + "epoch": 7.05, + "learning_rate": 1.4763313609467455e-05, + "loss": 0.4203, + "step": 8337, + "task_loss": 0.3991377651691437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2562532424926758, + "epoch": 7.05, + "learning_rate": 1.4759087066779376e-05, + "loss": 0.3027, + "step": 8338, + "task_loss": 0.2564619779586792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8435754776000977, + "epoch": 7.05, + "learning_rate": 1.4754860524091293e-05, + "loss": 0.5915, + "step": 8339, + "task_loss": 1.1529139280319214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2510181665420532, + "epoch": 7.05, + "learning_rate": 1.4750633981403214e-05, + "loss": 0.3461, + "step": 8340, + "task_loss": 0.0932350903749466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44214338064193726, + "epoch": 7.05, + "learning_rate": 1.4746407438715132e-05, + "loss": 0.3211, + "step": 8341, + "task_loss": 0.5769551396369934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4646173119544983, + "epoch": 7.05, + "learning_rate": 1.474218089602705e-05, + "loss": 0.4324, + "step": 8342, + "task_loss": 0.6291733980178833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3288612365722656, + "epoch": 7.05, + "learning_rate": 1.473795435333897e-05, + "loss": 0.4527, + "step": 8343, + "task_loss": 0.5451481342315674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24552994966506958, + "epoch": 7.05, + "learning_rate": 1.4733727810650888e-05, + "loss": 0.3535, + "step": 8344, + "task_loss": 0.557092010974884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4155973494052887, + "epoch": 7.05, + "learning_rate": 1.4729501267962806e-05, + "loss": 0.4328, + "step": 8345, + "task_loss": 1.024409532546997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1909499615430832, + "epoch": 7.05, + "learning_rate": 1.4725274725274726e-05, + "loss": 0.4025, + "step": 8346, + "task_loss": 0.15733684599399567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40135157108306885, + "epoch": 7.06, + "learning_rate": 1.4721048182586644e-05, + "loss": 0.4593, + "step": 8347, + "task_loss": 0.6721363067626953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3227062523365021, + "epoch": 7.06, + "learning_rate": 1.4716821639898565e-05, + "loss": 0.3324, + "step": 8348, + "task_loss": 0.661353588104248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6425338387489319, + "epoch": 7.06, + "learning_rate": 1.4712595097210483e-05, + "loss": 0.5286, + "step": 8349, + "task_loss": 0.9753245115280151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27931398153305054, + "epoch": 7.06, + "learning_rate": 1.47083685545224e-05, + "loss": 0.3629, + "step": 8350, + "task_loss": 0.4048572778701782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40779945254325867, + "epoch": 7.06, + "learning_rate": 1.4704142011834321e-05, + "loss": 0.411, + "step": 8351, + "task_loss": 1.1448734998703003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5855027437210083, + "epoch": 7.06, + "learning_rate": 1.4699915469146239e-05, + "loss": 0.346, + "step": 8352, + "task_loss": 0.46638232469558716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43462449312210083, + "epoch": 7.06, + "learning_rate": 1.4695688926458159e-05, + "loss": 0.3666, + "step": 8353, + "task_loss": 0.7084907293319702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.395760715007782, + "epoch": 7.06, + "learning_rate": 1.4691462383770077e-05, + "loss": 0.3936, + "step": 8354, + "task_loss": 0.5238751769065857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.505798876285553, + "epoch": 7.06, + "learning_rate": 1.4687235841081995e-05, + "loss": 0.3663, + "step": 8355, + "task_loss": 0.529261589050293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29745352268218994, + "epoch": 7.06, + "learning_rate": 1.4683009298393915e-05, + "loss": 0.3556, + "step": 8356, + "task_loss": 0.4951339662075043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2606540024280548, + "epoch": 7.06, + "learning_rate": 1.4678782755705833e-05, + "loss": 0.5152, + "step": 8357, + "task_loss": 1.1197713613510132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3915786147117615, + "epoch": 7.07, + "learning_rate": 1.467455621301775e-05, + "loss": 0.559, + "step": 8358, + "task_loss": 0.6039315462112427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48208317160606384, + "epoch": 7.07, + "learning_rate": 1.4670329670329672e-05, + "loss": 0.5084, + "step": 8359, + "task_loss": 0.703864574432373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3494284749031067, + "epoch": 7.07, + "learning_rate": 1.466610312764159e-05, + "loss": 0.3168, + "step": 8360, + "task_loss": 0.591950535774231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5578014850616455, + "epoch": 7.07, + "learning_rate": 1.466187658495351e-05, + "loss": 0.3459, + "step": 8361, + "task_loss": 0.8674119114875793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3628447651863098, + "epoch": 7.07, + "learning_rate": 1.4657650042265428e-05, + "loss": 0.3516, + "step": 8362, + "task_loss": 0.5179589986801147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3679960370063782, + "epoch": 7.07, + "learning_rate": 1.4653423499577346e-05, + "loss": 0.4046, + "step": 8363, + "task_loss": 0.37846291065216064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29077625274658203, + "epoch": 7.07, + "learning_rate": 1.4649196956889266e-05, + "loss": 0.4831, + "step": 8364, + "task_loss": 0.55636066198349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22993029654026031, + "epoch": 7.07, + "learning_rate": 1.4644970414201184e-05, + "loss": 0.3161, + "step": 8365, + "task_loss": 0.5594084858894348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5078344345092773, + "epoch": 7.07, + "learning_rate": 1.4640743871513102e-05, + "loss": 0.3597, + "step": 8366, + "task_loss": 0.3144093453884125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21866194903850555, + "epoch": 7.07, + "learning_rate": 1.4636517328825022e-05, + "loss": 0.2842, + "step": 8367, + "task_loss": 0.03512582555413246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6771223545074463, + "epoch": 7.07, + "learning_rate": 1.463229078613694e-05, + "loss": 0.4666, + "step": 8368, + "task_loss": 1.0261646509170532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30612093210220337, + "epoch": 7.07, + "learning_rate": 1.4628064243448861e-05, + "loss": 0.423, + "step": 8369, + "task_loss": 0.08478358387947083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2628135085105896, + "epoch": 7.08, + "learning_rate": 1.462383770076078e-05, + "loss": 0.5317, + "step": 8370, + "task_loss": 0.658598780632019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4806727170944214, + "epoch": 7.08, + "learning_rate": 1.4619611158072696e-05, + "loss": 0.3946, + "step": 8371, + "task_loss": 0.674254834651947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3670101463794708, + "epoch": 7.08, + "learning_rate": 1.4615384615384617e-05, + "loss": 0.4243, + "step": 8372, + "task_loss": 1.2578107118606567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3100663423538208, + "epoch": 7.08, + "learning_rate": 1.4611158072696535e-05, + "loss": 0.4452, + "step": 8373, + "task_loss": 0.8208237886428833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43752744793891907, + "epoch": 7.08, + "learning_rate": 1.4606931530008453e-05, + "loss": 0.3856, + "step": 8374, + "task_loss": 0.16295449435710907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3186638653278351, + "epoch": 7.08, + "learning_rate": 1.4602704987320373e-05, + "loss": 0.4149, + "step": 8375, + "task_loss": 0.2676573395729065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.343463659286499, + "epoch": 7.08, + "learning_rate": 1.459847844463229e-05, + "loss": 0.3776, + "step": 8376, + "task_loss": 1.2625263929367065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39731496572494507, + "epoch": 7.08, + "learning_rate": 1.4594251901944212e-05, + "loss": 0.3956, + "step": 8377, + "task_loss": 0.41253435611724854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3758246898651123, + "epoch": 7.08, + "learning_rate": 1.4590025359256129e-05, + "loss": 0.4325, + "step": 8378, + "task_loss": 0.647294282913208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32561200857162476, + "epoch": 7.08, + "learning_rate": 1.4585798816568047e-05, + "loss": 0.3627, + "step": 8379, + "task_loss": 0.5770902037620544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5017141103744507, + "epoch": 7.08, + "learning_rate": 1.4581572273879968e-05, + "loss": 0.4986, + "step": 8380, + "task_loss": 2.031759262084961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3875226378440857, + "epoch": 7.08, + "learning_rate": 1.4577345731191886e-05, + "loss": 0.3372, + "step": 8381, + "task_loss": 0.6885057091712952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4345259666442871, + "epoch": 7.09, + "learning_rate": 1.4573119188503802e-05, + "loss": 0.4647, + "step": 8382, + "task_loss": 0.7620514035224915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26603513956069946, + "epoch": 7.09, + "learning_rate": 1.4568892645815724e-05, + "loss": 0.4349, + "step": 8383, + "task_loss": 0.5241237282752991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.507420003414154, + "epoch": 7.09, + "learning_rate": 1.4564666103127642e-05, + "loss": 0.3472, + "step": 8384, + "task_loss": 0.7135237455368042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.521060585975647, + "epoch": 7.09, + "learning_rate": 1.4560439560439562e-05, + "loss": 0.4545, + "step": 8385, + "task_loss": 0.6666771769523621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3788931965827942, + "epoch": 7.09, + "learning_rate": 1.455621301775148e-05, + "loss": 0.4404, + "step": 8386, + "task_loss": 0.45320555567741394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6414828300476074, + "epoch": 7.09, + "learning_rate": 1.4551986475063398e-05, + "loss": 0.4703, + "step": 8387, + "task_loss": 0.5006552338600159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31551817059516907, + "epoch": 7.09, + "learning_rate": 1.4547759932375318e-05, + "loss": 0.5051, + "step": 8388, + "task_loss": 0.8855440020561218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27681753039360046, + "epoch": 7.09, + "learning_rate": 1.4543533389687236e-05, + "loss": 0.3167, + "step": 8389, + "task_loss": 1.2749816179275513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4604836702346802, + "epoch": 7.09, + "learning_rate": 1.4539306846999157e-05, + "loss": 0.4733, + "step": 8390, + "task_loss": 0.7168909907341003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29975900053977966, + "epoch": 7.09, + "learning_rate": 1.4535080304311075e-05, + "loss": 0.394, + "step": 8391, + "task_loss": 0.592528223991394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37415575981140137, + "epoch": 7.09, + "learning_rate": 1.4530853761622993e-05, + "loss": 0.3915, + "step": 8392, + "task_loss": 0.7118204832077026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3613831698894501, + "epoch": 7.09, + "learning_rate": 1.4526627218934913e-05, + "loss": 0.4351, + "step": 8393, + "task_loss": 0.36225467920303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3314611613750458, + "epoch": 7.1, + "learning_rate": 1.4522400676246831e-05, + "loss": 0.42, + "step": 8394, + "task_loss": 0.6385670900344849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39198702573776245, + "epoch": 7.1, + "learning_rate": 1.4518174133558749e-05, + "loss": 0.3671, + "step": 8395, + "task_loss": 0.19258631765842438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5684965252876282, + "epoch": 7.1, + "learning_rate": 1.4513947590870669e-05, + "loss": 0.4379, + "step": 8396, + "task_loss": 0.8100693225860596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44245094060897827, + "epoch": 7.1, + "learning_rate": 1.4509721048182587e-05, + "loss": 0.4298, + "step": 8397, + "task_loss": 0.8396496176719666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3851650357246399, + "epoch": 7.1, + "learning_rate": 1.4505494505494508e-05, + "loss": 0.3718, + "step": 8398, + "task_loss": 1.311450481414795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2873406708240509, + "epoch": 7.1, + "learning_rate": 1.4501267962806425e-05, + "loss": 0.3326, + "step": 8399, + "task_loss": 0.7790699005126953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25097569823265076, + "epoch": 7.1, + "learning_rate": 1.4497041420118343e-05, + "loss": 0.4242, + "step": 8400, + "task_loss": 0.4699631929397583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4071921706199646, + "epoch": 7.1, + "learning_rate": 1.4492814877430264e-05, + "loss": 0.3151, + "step": 8401, + "task_loss": 0.40444299578666687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4477057456970215, + "epoch": 7.1, + "learning_rate": 1.4488588334742182e-05, + "loss": 0.4014, + "step": 8402, + "task_loss": 0.34798693656921387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21141371130943298, + "epoch": 7.1, + "learning_rate": 1.4484361792054098e-05, + "loss": 0.3206, + "step": 8403, + "task_loss": 0.2571597397327423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3580038547515869, + "epoch": 7.1, + "learning_rate": 1.448013524936602e-05, + "loss": 0.4085, + "step": 8404, + "task_loss": 0.24061186611652374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33964747190475464, + "epoch": 7.1, + "learning_rate": 1.4475908706677938e-05, + "loss": 0.3618, + "step": 8405, + "task_loss": 0.7125963568687439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46652156114578247, + "epoch": 7.11, + "learning_rate": 1.4471682163989858e-05, + "loss": 0.5561, + "step": 8406, + "task_loss": 0.43365633487701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43037891387939453, + "epoch": 7.11, + "learning_rate": 1.4467455621301776e-05, + "loss": 0.4245, + "step": 8407, + "task_loss": 1.1133359670639038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.576919436454773, + "epoch": 7.11, + "learning_rate": 1.4463229078613694e-05, + "loss": 0.4285, + "step": 8408, + "task_loss": 1.2657321691513062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26890116930007935, + "epoch": 7.11, + "learning_rate": 1.4459002535925615e-05, + "loss": 0.373, + "step": 8409, + "task_loss": 0.5192141532897949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25023606419563293, + "epoch": 7.11, + "learning_rate": 1.4454775993237531e-05, + "loss": 0.2288, + "step": 8410, + "task_loss": 0.37080201506614685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26618292927742004, + "epoch": 7.11, + "learning_rate": 1.445054945054945e-05, + "loss": 0.3846, + "step": 8411, + "task_loss": 0.5393050312995911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3965601623058319, + "epoch": 7.11, + "learning_rate": 1.4446322907861371e-05, + "loss": 0.3517, + "step": 8412, + "task_loss": 0.3262026607990265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7163786888122559, + "epoch": 7.11, + "learning_rate": 1.4442096365173289e-05, + "loss": 0.4877, + "step": 8413, + "task_loss": 1.8616886138916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.779911994934082, + "epoch": 7.11, + "learning_rate": 1.4437869822485209e-05, + "loss": 0.4698, + "step": 8414, + "task_loss": 0.9641918540000916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2862231731414795, + "epoch": 7.11, + "learning_rate": 1.4433643279797127e-05, + "loss": 0.3652, + "step": 8415, + "task_loss": 0.46664008498191833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25905841588974, + "epoch": 7.11, + "learning_rate": 1.4429416737109045e-05, + "loss": 0.3587, + "step": 8416, + "task_loss": 0.2576560974121094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.328513503074646, + "epoch": 7.11, + "learning_rate": 1.4425190194420965e-05, + "loss": 0.4159, + "step": 8417, + "task_loss": 0.9867499470710754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3979632556438446, + "epoch": 7.12, + "learning_rate": 1.4420963651732883e-05, + "loss": 0.3711, + "step": 8418, + "task_loss": 0.44069308042526245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3352411985397339, + "epoch": 7.12, + "learning_rate": 1.4416737109044804e-05, + "loss": 0.4819, + "step": 8419, + "task_loss": 0.5708295106887817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4830598533153534, + "epoch": 7.12, + "learning_rate": 1.441251056635672e-05, + "loss": 0.3979, + "step": 8420, + "task_loss": 0.29789814352989197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6309598684310913, + "epoch": 7.12, + "learning_rate": 1.4408284023668638e-05, + "loss": 0.4275, + "step": 8421, + "task_loss": 1.065609097480774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5073386430740356, + "epoch": 7.12, + "learning_rate": 1.440405748098056e-05, + "loss": 0.4938, + "step": 8422, + "task_loss": 1.0641427040100098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41760551929473877, + "epoch": 7.12, + "learning_rate": 1.4399830938292478e-05, + "loss": 0.3676, + "step": 8423, + "task_loss": 1.1484755277633667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29046791791915894, + "epoch": 7.12, + "learning_rate": 1.4395604395604396e-05, + "loss": 0.3881, + "step": 8424, + "task_loss": 0.41065356135368347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27040597796440125, + "epoch": 7.12, + "learning_rate": 1.4391377852916316e-05, + "loss": 0.4059, + "step": 8425, + "task_loss": 1.3025606870651245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2354079782962799, + "epoch": 7.12, + "learning_rate": 1.4387151310228234e-05, + "loss": 0.4741, + "step": 8426, + "task_loss": 0.25394389033317566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4093613922595978, + "epoch": 7.12, + "learning_rate": 1.4382924767540153e-05, + "loss": 0.3411, + "step": 8427, + "task_loss": 0.3733874559402466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19723108410835266, + "epoch": 7.12, + "learning_rate": 1.4378698224852072e-05, + "loss": 0.5138, + "step": 8428, + "task_loss": 0.60390305519104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4218757152557373, + "epoch": 7.13, + "learning_rate": 1.437447168216399e-05, + "loss": 0.4909, + "step": 8429, + "task_loss": 0.5015130043029785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28441160917282104, + "epoch": 7.13, + "learning_rate": 1.4370245139475911e-05, + "loss": 0.4701, + "step": 8430, + "task_loss": 0.6278700828552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45802924036979675, + "epoch": 7.13, + "learning_rate": 1.4366018596787827e-05, + "loss": 0.4951, + "step": 8431, + "task_loss": 0.5403968095779419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4281238317489624, + "epoch": 7.13, + "learning_rate": 1.4361792054099745e-05, + "loss": 0.3258, + "step": 8432, + "task_loss": 0.6291800141334534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5249758362770081, + "epoch": 7.13, + "learning_rate": 1.4357565511411667e-05, + "loss": 0.3574, + "step": 8433, + "task_loss": 0.6581066846847534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40616482496261597, + "epoch": 7.13, + "learning_rate": 1.4353338968723585e-05, + "loss": 0.3343, + "step": 8434, + "task_loss": 0.7830513119697571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28221940994262695, + "epoch": 7.13, + "learning_rate": 1.4349112426035505e-05, + "loss": 0.4968, + "step": 8435, + "task_loss": 0.3690681755542755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45464053750038147, + "epoch": 7.13, + "learning_rate": 1.4344885883347423e-05, + "loss": 0.4114, + "step": 8436, + "task_loss": 0.5992862582206726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26880329847335815, + "epoch": 7.13, + "learning_rate": 1.434065934065934e-05, + "loss": 0.3347, + "step": 8437, + "task_loss": 0.8009253740310669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6621055603027344, + "epoch": 7.13, + "learning_rate": 1.433643279797126e-05, + "loss": 0.557, + "step": 8438, + "task_loss": 0.5487411022186279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20455482602119446, + "epoch": 7.13, + "learning_rate": 1.4332206255283178e-05, + "loss": 0.3814, + "step": 8439, + "task_loss": 0.5300395488739014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3860228955745697, + "epoch": 7.13, + "learning_rate": 1.4327979712595097e-05, + "loss": 0.3239, + "step": 8440, + "task_loss": 1.090928554534912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5255444049835205, + "epoch": 7.14, + "learning_rate": 1.4323753169907016e-05, + "loss": 0.4216, + "step": 8441, + "task_loss": 0.32724031805992126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41834431886672974, + "epoch": 7.14, + "learning_rate": 1.4319526627218934e-05, + "loss": 0.4843, + "step": 8442, + "task_loss": 1.2157323360443115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7163252830505371, + "epoch": 7.14, + "learning_rate": 1.4315300084530856e-05, + "loss": 0.4413, + "step": 8443, + "task_loss": 0.743472158908844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5468880534172058, + "epoch": 7.14, + "learning_rate": 1.4311073541842774e-05, + "loss": 0.5176, + "step": 8444, + "task_loss": 0.5528901219367981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5406476259231567, + "epoch": 7.14, + "learning_rate": 1.4306846999154692e-05, + "loss": 0.4607, + "step": 8445, + "task_loss": 1.1136703491210938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20621834695339203, + "epoch": 7.14, + "learning_rate": 1.4302620456466612e-05, + "loss": 0.3089, + "step": 8446, + "task_loss": 0.13167735934257507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.417783260345459, + "epoch": 7.14, + "learning_rate": 1.429839391377853e-05, + "loss": 0.4804, + "step": 8447, + "task_loss": 0.5905869007110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4282853901386261, + "epoch": 7.14, + "learning_rate": 1.429416737109045e-05, + "loss": 0.4501, + "step": 8448, + "task_loss": 0.9756159782409668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45216748118400574, + "epoch": 7.14, + "learning_rate": 1.4289940828402367e-05, + "loss": 0.261, + "step": 8449, + "task_loss": 1.13016676902771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4449964463710785, + "epoch": 7.14, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.3383, + "step": 8450, + "task_loss": 0.7582607269287109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4292390048503876, + "epoch": 7.14, + "learning_rate": 1.4281487743026207e-05, + "loss": 0.3779, + "step": 8451, + "task_loss": 0.2806881368160248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3443630039691925, + "epoch": 7.14, + "learning_rate": 1.4277261200338123e-05, + "loss": 0.4384, + "step": 8452, + "task_loss": 0.2718794047832489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2630396783351898, + "epoch": 7.15, + "learning_rate": 1.4273034657650041e-05, + "loss": 0.3079, + "step": 8453, + "task_loss": 0.5198727250099182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.267972469329834, + "epoch": 7.15, + "learning_rate": 1.4268808114961963e-05, + "loss": 0.4167, + "step": 8454, + "task_loss": 0.712524950504303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3115214705467224, + "epoch": 7.15, + "learning_rate": 1.426458157227388e-05, + "loss": 0.3991, + "step": 8455, + "task_loss": 0.6736319065093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2567656934261322, + "epoch": 7.15, + "learning_rate": 1.42603550295858e-05, + "loss": 0.3718, + "step": 8456, + "task_loss": 0.3898475766181946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2431955337524414, + "epoch": 7.15, + "learning_rate": 1.4256128486897719e-05, + "loss": 0.392, + "step": 8457, + "task_loss": 0.5110139846801758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6100138425827026, + "epoch": 7.15, + "learning_rate": 1.4251901944209637e-05, + "loss": 0.4836, + "step": 8458, + "task_loss": 0.5632320642471313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6514887809753418, + "epoch": 7.15, + "learning_rate": 1.4247675401521556e-05, + "loss": 0.4821, + "step": 8459, + "task_loss": 0.8805688619613647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21099117398262024, + "epoch": 7.15, + "learning_rate": 1.4243448858833474e-05, + "loss": 0.3596, + "step": 8460, + "task_loss": 0.10741354525089264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5412999987602234, + "epoch": 7.15, + "learning_rate": 1.4239222316145392e-05, + "loss": 0.4048, + "step": 8461, + "task_loss": 0.8236729502677917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5572458505630493, + "epoch": 7.15, + "learning_rate": 1.4234995773457314e-05, + "loss": 0.3967, + "step": 8462, + "task_loss": 1.0607006549835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5347450971603394, + "epoch": 7.15, + "learning_rate": 1.423076923076923e-05, + "loss": 0.4734, + "step": 8463, + "task_loss": 0.3777298927307129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3408353626728058, + "epoch": 7.15, + "learning_rate": 1.4226542688081152e-05, + "loss": 0.3535, + "step": 8464, + "task_loss": 0.7234189510345459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20471210777759552, + "epoch": 7.16, + "learning_rate": 1.422231614539307e-05, + "loss": 0.3814, + "step": 8465, + "task_loss": 0.606413722038269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.301734983921051, + "epoch": 7.16, + "learning_rate": 1.4218089602704988e-05, + "loss": 0.3655, + "step": 8466, + "task_loss": 0.08791442215442657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.232186958193779, + "epoch": 7.16, + "learning_rate": 1.4213863060016907e-05, + "loss": 0.3861, + "step": 8467, + "task_loss": 0.26512786746025085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34809571504592896, + "epoch": 7.16, + "learning_rate": 1.4209636517328825e-05, + "loss": 0.3458, + "step": 8468, + "task_loss": 0.28579840064048767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3929246664047241, + "epoch": 7.16, + "learning_rate": 1.4205409974640744e-05, + "loss": 0.3479, + "step": 8469, + "task_loss": 0.39567896723747253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19044481217861176, + "epoch": 7.16, + "learning_rate": 1.4201183431952663e-05, + "loss": 0.3302, + "step": 8470, + "task_loss": 0.47425198554992676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6478866338729858, + "epoch": 7.16, + "learning_rate": 1.4196956889264581e-05, + "loss": 0.5911, + "step": 8471, + "task_loss": 0.17720377445220947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2541215419769287, + "epoch": 7.16, + "learning_rate": 1.4192730346576503e-05, + "loss": 0.2974, + "step": 8472, + "task_loss": 0.1366140991449356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39000970125198364, + "epoch": 7.16, + "learning_rate": 1.4188503803888419e-05, + "loss": 0.3192, + "step": 8473, + "task_loss": 0.5380995273590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29146015644073486, + "epoch": 7.16, + "learning_rate": 1.4184277261200337e-05, + "loss": 0.3622, + "step": 8474, + "task_loss": 0.7985250949859619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3090130090713501, + "epoch": 7.16, + "learning_rate": 1.4180050718512259e-05, + "loss": 0.4192, + "step": 8475, + "task_loss": 0.7839860916137695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40757304430007935, + "epoch": 7.16, + "learning_rate": 1.4175824175824177e-05, + "loss": 0.405, + "step": 8476, + "task_loss": 0.9707086682319641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2710322439670563, + "epoch": 7.17, + "learning_rate": 1.4171597633136096e-05, + "loss": 0.4355, + "step": 8477, + "task_loss": 0.974820613861084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40213143825531006, + "epoch": 7.17, + "learning_rate": 1.4167371090448014e-05, + "loss": 0.5155, + "step": 8478, + "task_loss": 1.0192489624023438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37129613757133484, + "epoch": 7.17, + "learning_rate": 1.4163144547759932e-05, + "loss": 0.3896, + "step": 8479, + "task_loss": 0.14015524089336395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30047017335891724, + "epoch": 7.17, + "learning_rate": 1.4158918005071852e-05, + "loss": 0.4828, + "step": 8480, + "task_loss": 0.43552565574645996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2602026164531708, + "epoch": 7.17, + "learning_rate": 1.415469146238377e-05, + "loss": 0.3499, + "step": 8481, + "task_loss": 0.764029324054718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4497409462928772, + "epoch": 7.17, + "learning_rate": 1.4150464919695688e-05, + "loss": 0.5215, + "step": 8482, + "task_loss": 0.7212861180305481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5065480470657349, + "epoch": 7.17, + "learning_rate": 1.414623837700761e-05, + "loss": 0.4278, + "step": 8483, + "task_loss": 0.47008541226387024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6751823425292969, + "epoch": 7.17, + "learning_rate": 1.4142011834319526e-05, + "loss": 0.432, + "step": 8484, + "task_loss": 0.43745505809783936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2547656297683716, + "epoch": 7.17, + "learning_rate": 1.4137785291631447e-05, + "loss": 0.3467, + "step": 8485, + "task_loss": 0.2236821949481964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19951918721199036, + "epoch": 7.17, + "learning_rate": 1.4133558748943366e-05, + "loss": 0.2963, + "step": 8486, + "task_loss": 0.10158511996269226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2329501360654831, + "epoch": 7.17, + "learning_rate": 1.4129332206255284e-05, + "loss": 0.5852, + "step": 8487, + "task_loss": 1.3158363103866577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4098413288593292, + "epoch": 7.17, + "learning_rate": 1.4125105663567203e-05, + "loss": 0.4492, + "step": 8488, + "task_loss": 0.6435500383377075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36278852820396423, + "epoch": 7.18, + "learning_rate": 1.4120879120879121e-05, + "loss": 0.3517, + "step": 8489, + "task_loss": 0.16912001371383667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23609913885593414, + "epoch": 7.18, + "learning_rate": 1.411665257819104e-05, + "loss": 0.4015, + "step": 8490, + "task_loss": 0.1170438826084137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6304333209991455, + "epoch": 7.18, + "learning_rate": 1.4112426035502959e-05, + "loss": 0.4668, + "step": 8491, + "task_loss": 0.8908385634422302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26012930274009705, + "epoch": 7.18, + "learning_rate": 1.4108199492814877e-05, + "loss": 0.4018, + "step": 8492, + "task_loss": 0.17272405326366425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3364792466163635, + "epoch": 7.18, + "learning_rate": 1.4103972950126799e-05, + "loss": 0.3985, + "step": 8493, + "task_loss": 0.27075305581092834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5751929879188538, + "epoch": 7.18, + "learning_rate": 1.4099746407438717e-05, + "loss": 0.5474, + "step": 8494, + "task_loss": 1.724953532218933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2538541555404663, + "epoch": 7.18, + "learning_rate": 1.4095519864750633e-05, + "loss": 0.337, + "step": 8495, + "task_loss": 0.5670956373214722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2530858516693115, + "epoch": 7.18, + "learning_rate": 1.4091293322062554e-05, + "loss": 0.2667, + "step": 8496, + "task_loss": 1.0100651979446411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13354969024658203, + "epoch": 7.18, + "learning_rate": 1.4087066779374472e-05, + "loss": 0.4209, + "step": 8497, + "task_loss": 0.09509512037038803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24093709886074066, + "epoch": 7.18, + "learning_rate": 1.408284023668639e-05, + "loss": 0.3687, + "step": 8498, + "task_loss": 0.05606451630592346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2583601772785187, + "epoch": 7.18, + "learning_rate": 1.407861369399831e-05, + "loss": 0.4065, + "step": 8499, + "task_loss": 0.876528799533844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3202129900455475, + "epoch": 7.19, + "learning_rate": 1.4074387151310228e-05, + "loss": 0.2448, + "step": 8500, + "task_loss": 0.3950749337673187 + }, + { + "epoch": 7.19, + "eval_accuracy": 0.9123564356435644, + "eval_loss": 0.2746867537498474, + "eval_runtime": 226.0412, + "eval_samples_per_second": 111.705, + "eval_steps_per_second": 0.876, + "step": 8500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28804031014442444, + "epoch": 7.19, + "learning_rate": 1.4070160608622148e-05, + "loss": 0.355, + "step": 8501, + "task_loss": 0.6555396914482117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20275960862636566, + "epoch": 7.19, + "learning_rate": 1.4065934065934066e-05, + "loss": 0.3712, + "step": 8502, + "task_loss": 0.14004822075366974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2455885112285614, + "epoch": 7.19, + "learning_rate": 1.4061707523245984e-05, + "loss": 0.3485, + "step": 8503, + "task_loss": 0.32804399728775024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4071706235408783, + "epoch": 7.19, + "learning_rate": 1.4057480980557906e-05, + "loss": 0.4001, + "step": 8504, + "task_loss": 1.0248295068740845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33268868923187256, + "epoch": 7.19, + "learning_rate": 1.4053254437869822e-05, + "loss": 0.3663, + "step": 8505, + "task_loss": 0.42013972997665405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5348602533340454, + "epoch": 7.19, + "learning_rate": 1.4049027895181743e-05, + "loss": 0.3566, + "step": 8506, + "task_loss": 0.42829397320747375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4743642508983612, + "epoch": 7.19, + "learning_rate": 1.4044801352493661e-05, + "loss": 0.5073, + "step": 8507, + "task_loss": 0.6758993864059448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5060102939605713, + "epoch": 7.19, + "learning_rate": 1.404057480980558e-05, + "loss": 0.571, + "step": 8508, + "task_loss": 0.27656054496765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4982975721359253, + "epoch": 7.19, + "learning_rate": 1.40363482671175e-05, + "loss": 0.4394, + "step": 8509, + "task_loss": 0.7105655074119568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22778832912445068, + "epoch": 7.19, + "learning_rate": 1.4032121724429417e-05, + "loss": 0.3419, + "step": 8510, + "task_loss": 0.7158111929893494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6181659698486328, + "epoch": 7.19, + "learning_rate": 1.4027895181741335e-05, + "loss": 0.6052, + "step": 8511, + "task_loss": 2.570383310317993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48055508732795715, + "epoch": 7.2, + "learning_rate": 1.4023668639053255e-05, + "loss": 0.3213, + "step": 8512, + "task_loss": 0.9995883703231812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24953579902648926, + "epoch": 7.2, + "learning_rate": 1.4019442096365173e-05, + "loss": 0.4412, + "step": 8513, + "task_loss": 0.46950456500053406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5099080801010132, + "epoch": 7.2, + "learning_rate": 1.4015215553677094e-05, + "loss": 0.5014, + "step": 8514, + "task_loss": 0.9196261167526245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2941228151321411, + "epoch": 7.2, + "learning_rate": 1.4010989010989013e-05, + "loss": 0.3823, + "step": 8515, + "task_loss": 0.4850535988807678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2971624433994293, + "epoch": 7.2, + "learning_rate": 1.4006762468300929e-05, + "loss": 0.3671, + "step": 8516, + "task_loss": 0.32122349739074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4055556654930115, + "epoch": 7.2, + "learning_rate": 1.400253592561285e-05, + "loss": 0.4287, + "step": 8517, + "task_loss": 0.519234299659729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20970256626605988, + "epoch": 7.2, + "learning_rate": 1.3998309382924768e-05, + "loss": 0.3326, + "step": 8518, + "task_loss": 0.03902333602309227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3713664412498474, + "epoch": 7.2, + "learning_rate": 1.3994082840236686e-05, + "loss": 0.4008, + "step": 8519, + "task_loss": 0.14999502897262573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37278780341148376, + "epoch": 7.2, + "learning_rate": 1.3989856297548606e-05, + "loss": 0.3808, + "step": 8520, + "task_loss": 0.343439519405365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28966426849365234, + "epoch": 7.2, + "learning_rate": 1.3985629754860524e-05, + "loss": 0.3902, + "step": 8521, + "task_loss": 0.35030561685562134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2308727204799652, + "epoch": 7.2, + "learning_rate": 1.3981403212172444e-05, + "loss": 0.404, + "step": 8522, + "task_loss": 0.28466206789016724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4016810953617096, + "epoch": 7.2, + "learning_rate": 1.3977176669484362e-05, + "loss": 0.3862, + "step": 8523, + "task_loss": 0.39856624603271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3083394169807434, + "epoch": 7.21, + "learning_rate": 1.397295012679628e-05, + "loss": 0.3712, + "step": 8524, + "task_loss": 0.6056357622146606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2287510633468628, + "epoch": 7.21, + "learning_rate": 1.3968723584108201e-05, + "loss": 0.3241, + "step": 8525, + "task_loss": 0.556786060333252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2634776532649994, + "epoch": 7.21, + "learning_rate": 1.396449704142012e-05, + "loss": 0.3582, + "step": 8526, + "task_loss": 0.4338427782058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37449824810028076, + "epoch": 7.21, + "learning_rate": 1.3960270498732036e-05, + "loss": 0.361, + "step": 8527, + "task_loss": 0.8757673501968384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20253527164459229, + "epoch": 7.21, + "learning_rate": 1.3956043956043957e-05, + "loss": 0.4218, + "step": 8528, + "task_loss": 0.16213764250278473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3754189610481262, + "epoch": 7.21, + "learning_rate": 1.3951817413355875e-05, + "loss": 0.3449, + "step": 8529, + "task_loss": 0.06617217510938644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5472580790519714, + "epoch": 7.21, + "learning_rate": 1.3947590870667795e-05, + "loss": 0.3785, + "step": 8530, + "task_loss": 0.8132808804512024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2919911742210388, + "epoch": 7.21, + "learning_rate": 1.3943364327979713e-05, + "loss": 0.4958, + "step": 8531, + "task_loss": 0.37474772334098816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2878856062889099, + "epoch": 7.21, + "learning_rate": 1.3939137785291631e-05, + "loss": 0.4154, + "step": 8532, + "task_loss": 0.2587900161743164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28955981135368347, + "epoch": 7.21, + "learning_rate": 1.3934911242603551e-05, + "loss": 0.3374, + "step": 8533, + "task_loss": 0.47811076045036316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21137693524360657, + "epoch": 7.21, + "learning_rate": 1.3930684699915469e-05, + "loss": 0.4438, + "step": 8534, + "task_loss": 0.2500072121620178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4729272723197937, + "epoch": 7.21, + "learning_rate": 1.392645815722739e-05, + "loss": 0.4598, + "step": 8535, + "task_loss": 0.6682793498039246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30739715695381165, + "epoch": 7.22, + "learning_rate": 1.3922231614539308e-05, + "loss": 0.3144, + "step": 8536, + "task_loss": 0.8692510724067688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27924707531929016, + "epoch": 7.22, + "learning_rate": 1.3918005071851225e-05, + "loss": 0.2895, + "step": 8537, + "task_loss": 0.4635639786720276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30036357045173645, + "epoch": 7.22, + "learning_rate": 1.3913778529163146e-05, + "loss": 0.3438, + "step": 8538, + "task_loss": 0.5060112476348877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4012424051761627, + "epoch": 7.22, + "learning_rate": 1.3909551986475064e-05, + "loss": 0.4482, + "step": 8539, + "task_loss": 0.772833526134491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4473796486854553, + "epoch": 7.22, + "learning_rate": 1.3905325443786982e-05, + "loss": 0.394, + "step": 8540, + "task_loss": 0.34098735451698303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40113675594329834, + "epoch": 7.22, + "learning_rate": 1.3901098901098902e-05, + "loss": 0.5206, + "step": 8541, + "task_loss": 0.21092194318771362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3019765615463257, + "epoch": 7.22, + "learning_rate": 1.389687235841082e-05, + "loss": 0.3286, + "step": 8542, + "task_loss": 0.235370472073555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47698116302490234, + "epoch": 7.22, + "learning_rate": 1.3892645815722741e-05, + "loss": 0.4689, + "step": 8543, + "task_loss": 0.3217032551765442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3377663493156433, + "epoch": 7.22, + "learning_rate": 1.3888419273034658e-05, + "loss": 0.3652, + "step": 8544, + "task_loss": 0.41617658734321594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29802316427230835, + "epoch": 7.22, + "learning_rate": 1.3884192730346576e-05, + "loss": 0.3668, + "step": 8545, + "task_loss": 0.5862835049629211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5737792253494263, + "epoch": 7.22, + "learning_rate": 1.3879966187658497e-05, + "loss": 0.5202, + "step": 8546, + "task_loss": 0.6241216659545898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3956802487373352, + "epoch": 7.22, + "learning_rate": 1.3875739644970415e-05, + "loss": 0.4122, + "step": 8547, + "task_loss": 1.2210131883621216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4121933579444885, + "epoch": 7.23, + "learning_rate": 1.3871513102282332e-05, + "loss": 0.5259, + "step": 8548, + "task_loss": 0.5457651019096375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4244515895843506, + "epoch": 7.23, + "learning_rate": 1.3867286559594253e-05, + "loss": 0.3731, + "step": 8549, + "task_loss": 0.20137014985084534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5554964542388916, + "epoch": 7.23, + "learning_rate": 1.3863060016906171e-05, + "loss": 0.461, + "step": 8550, + "task_loss": 0.3971710205078125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40970927476882935, + "epoch": 7.23, + "learning_rate": 1.3858833474218091e-05, + "loss": 0.3715, + "step": 8551, + "task_loss": 0.9400939345359802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4315983057022095, + "epoch": 7.23, + "learning_rate": 1.3854606931530009e-05, + "loss": 0.3273, + "step": 8552, + "task_loss": 0.7486255168914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28100165724754333, + "epoch": 7.23, + "learning_rate": 1.3850380388841927e-05, + "loss": 0.3264, + "step": 8553, + "task_loss": 0.43455350399017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17266938090324402, + "epoch": 7.23, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.3853, + "step": 8554, + "task_loss": 0.24986565113067627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.479789674282074, + "epoch": 7.23, + "learning_rate": 1.3841927303465765e-05, + "loss": 0.5231, + "step": 8555, + "task_loss": 0.224117711186409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34944698214530945, + "epoch": 7.23, + "learning_rate": 1.3837700760777683e-05, + "loss": 0.3907, + "step": 8556, + "task_loss": 0.9573411345481873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5413358211517334, + "epoch": 7.23, + "learning_rate": 1.3833474218089604e-05, + "loss": 0.5717, + "step": 8557, + "task_loss": 0.7521502375602722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5187146663665771, + "epoch": 7.23, + "learning_rate": 1.3829247675401522e-05, + "loss": 0.3791, + "step": 8558, + "task_loss": 0.7211936712265015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3217013478279114, + "epoch": 7.23, + "learning_rate": 1.3825021132713442e-05, + "loss": 0.3773, + "step": 8559, + "task_loss": 0.184209942817688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3858310580253601, + "epoch": 7.24, + "learning_rate": 1.382079459002536e-05, + "loss": 0.3311, + "step": 8560, + "task_loss": 0.5542607307434082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30691802501678467, + "epoch": 7.24, + "learning_rate": 1.3816568047337278e-05, + "loss": 0.3789, + "step": 8561, + "task_loss": 0.5664929747581482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36254364252090454, + "epoch": 7.24, + "learning_rate": 1.3812341504649198e-05, + "loss": 0.418, + "step": 8562, + "task_loss": 0.4081762135028839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24940970540046692, + "epoch": 7.24, + "learning_rate": 1.3808114961961116e-05, + "loss": 0.3356, + "step": 8563, + "task_loss": 0.05239592120051384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5555640459060669, + "epoch": 7.24, + "learning_rate": 1.3803888419273037e-05, + "loss": 0.5096, + "step": 8564, + "task_loss": 1.212148666381836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4263947606086731, + "epoch": 7.24, + "learning_rate": 1.3799661876584954e-05, + "loss": 0.4928, + "step": 8565, + "task_loss": 1.0455942153930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3000696301460266, + "epoch": 7.24, + "learning_rate": 1.3795435333896872e-05, + "loss": 0.5257, + "step": 8566, + "task_loss": 0.4034479856491089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30817484855651855, + "epoch": 7.24, + "learning_rate": 1.3791208791208793e-05, + "loss": 0.3162, + "step": 8567, + "task_loss": 0.34903639554977417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29415959119796753, + "epoch": 7.24, + "learning_rate": 1.3786982248520711e-05, + "loss": 0.3466, + "step": 8568, + "task_loss": 0.6986730098724365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34249183535575867, + "epoch": 7.24, + "learning_rate": 1.3782755705832628e-05, + "loss": 0.3542, + "step": 8569, + "task_loss": 1.0983505249023438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38892924785614014, + "epoch": 7.24, + "learning_rate": 1.3778529163144549e-05, + "loss": 0.4218, + "step": 8570, + "task_loss": 0.30498379468917847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4897609353065491, + "epoch": 7.24, + "learning_rate": 1.3774302620456467e-05, + "loss": 0.4079, + "step": 8571, + "task_loss": 0.8633142709732056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4524924159049988, + "epoch": 7.25, + "learning_rate": 1.3770076077768387e-05, + "loss": 0.3466, + "step": 8572, + "task_loss": 0.7186328172683716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3173494338989258, + "epoch": 7.25, + "learning_rate": 1.3765849535080305e-05, + "loss": 0.4324, + "step": 8573, + "task_loss": 1.0681122541427612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5366412401199341, + "epoch": 7.25, + "learning_rate": 1.3761622992392223e-05, + "loss": 0.5694, + "step": 8574, + "task_loss": 0.5369795560836792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2759028375148773, + "epoch": 7.25, + "learning_rate": 1.3757396449704144e-05, + "loss": 0.4325, + "step": 8575, + "task_loss": 0.08812876790761948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6899816989898682, + "epoch": 7.25, + "learning_rate": 1.375316990701606e-05, + "loss": 0.4896, + "step": 8576, + "task_loss": 0.4380262792110443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5933284163475037, + "epoch": 7.25, + "learning_rate": 1.3748943364327979e-05, + "loss": 0.3765, + "step": 8577, + "task_loss": 0.6137198805809021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4504917562007904, + "epoch": 7.25, + "learning_rate": 1.37447168216399e-05, + "loss": 0.4347, + "step": 8578, + "task_loss": 1.3623602390289307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2852308750152588, + "epoch": 7.25, + "learning_rate": 1.3740490278951818e-05, + "loss": 0.4276, + "step": 8579, + "task_loss": 0.21618777513504028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4170371890068054, + "epoch": 7.25, + "learning_rate": 1.3736263736263738e-05, + "loss": 0.5685, + "step": 8580, + "task_loss": 1.2817444801330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41002053022384644, + "epoch": 7.25, + "learning_rate": 1.3732037193575656e-05, + "loss": 0.3637, + "step": 8581, + "task_loss": 0.8383623361587524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3668299615383148, + "epoch": 7.25, + "learning_rate": 1.3727810650887574e-05, + "loss": 0.3804, + "step": 8582, + "task_loss": 0.6670367121696472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30578210949897766, + "epoch": 7.26, + "learning_rate": 1.3723584108199494e-05, + "loss": 0.3686, + "step": 8583, + "task_loss": 0.14470511674880981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7104932069778442, + "epoch": 7.26, + "learning_rate": 1.3719357565511412e-05, + "loss": 0.4109, + "step": 8584, + "task_loss": 1.0525892972946167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2472262680530548, + "epoch": 7.26, + "learning_rate": 1.371513102282333e-05, + "loss": 0.264, + "step": 8585, + "task_loss": 0.3102058470249176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2992400825023651, + "epoch": 7.26, + "learning_rate": 1.371090448013525e-05, + "loss": 0.3902, + "step": 8586, + "task_loss": 1.0000417232513428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.397636741399765, + "epoch": 7.26, + "learning_rate": 1.3706677937447168e-05, + "loss": 0.3431, + "step": 8587, + "task_loss": 0.38747352361679077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3389577269554138, + "epoch": 7.26, + "learning_rate": 1.3702451394759089e-05, + "loss": 0.3182, + "step": 8588, + "task_loss": 0.7572832703590393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3555694818496704, + "epoch": 7.26, + "learning_rate": 1.3698224852071007e-05, + "loss": 0.2956, + "step": 8589, + "task_loss": 0.9081255197525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5013615489006042, + "epoch": 7.26, + "learning_rate": 1.3693998309382925e-05, + "loss": 0.4981, + "step": 8590, + "task_loss": 1.1445088386535645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25971195101737976, + "epoch": 7.26, + "learning_rate": 1.3689771766694845e-05, + "loss": 0.3105, + "step": 8591, + "task_loss": 0.6002896428108215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43126654624938965, + "epoch": 7.26, + "learning_rate": 1.3685545224006763e-05, + "loss": 0.4341, + "step": 8592, + "task_loss": 0.22942760586738586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3452215790748596, + "epoch": 7.26, + "learning_rate": 1.3681318681318683e-05, + "loss": 0.3979, + "step": 8593, + "task_loss": 1.3989520072937012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2451454997062683, + "epoch": 7.26, + "learning_rate": 1.36770921386306e-05, + "loss": 0.3929, + "step": 8594, + "task_loss": 0.78183513879776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3263765275478363, + "epoch": 7.27, + "learning_rate": 1.3672865595942519e-05, + "loss": 0.3592, + "step": 8595, + "task_loss": 0.24866414070129395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42782655358314514, + "epoch": 7.27, + "learning_rate": 1.366863905325444e-05, + "loss": 0.3086, + "step": 8596, + "task_loss": 0.4088786840438843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35576164722442627, + "epoch": 7.27, + "learning_rate": 1.3664412510566357e-05, + "loss": 0.4195, + "step": 8597, + "task_loss": 0.6562376022338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2822190523147583, + "epoch": 7.27, + "learning_rate": 1.3660185967878275e-05, + "loss": 0.4104, + "step": 8598, + "task_loss": 0.04291326552629471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4295153319835663, + "epoch": 7.27, + "learning_rate": 1.3655959425190196e-05, + "loss": 0.4032, + "step": 8599, + "task_loss": 0.5472261905670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3212353587150574, + "epoch": 7.27, + "learning_rate": 1.3651732882502114e-05, + "loss": 0.4338, + "step": 8600, + "task_loss": 1.1058002710342407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28444987535476685, + "epoch": 7.27, + "learning_rate": 1.3647506339814034e-05, + "loss": 0.3154, + "step": 8601, + "task_loss": 0.12570720911026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25766852498054504, + "epoch": 7.27, + "learning_rate": 1.3643279797125952e-05, + "loss": 0.2679, + "step": 8602, + "task_loss": 0.12526074051856995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3514648675918579, + "epoch": 7.27, + "learning_rate": 1.363905325443787e-05, + "loss": 0.3528, + "step": 8603, + "task_loss": 0.8331689238548279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3361646234989166, + "epoch": 7.27, + "learning_rate": 1.363482671174979e-05, + "loss": 0.3919, + "step": 8604, + "task_loss": 0.5955148339271545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3545660376548767, + "epoch": 7.27, + "learning_rate": 1.3630600169061708e-05, + "loss": 0.3839, + "step": 8605, + "task_loss": 1.0678670406341553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5521379709243774, + "epoch": 7.27, + "learning_rate": 1.3626373626373626e-05, + "loss": 0.5002, + "step": 8606, + "task_loss": 0.6152999401092529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4553637206554413, + "epoch": 7.28, + "learning_rate": 1.3622147083685547e-05, + "loss": 0.4117, + "step": 8607, + "task_loss": 0.19236235320568085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.540372908115387, + "epoch": 7.28, + "learning_rate": 1.3617920540997464e-05, + "loss": 0.4247, + "step": 8608, + "task_loss": 1.370190978050232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2881808876991272, + "epoch": 7.28, + "learning_rate": 1.3613693998309385e-05, + "loss": 0.3531, + "step": 8609, + "task_loss": 0.6731514930725098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25387847423553467, + "epoch": 7.28, + "learning_rate": 1.3609467455621303e-05, + "loss": 0.4682, + "step": 8610, + "task_loss": 0.9486519694328308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30359694361686707, + "epoch": 7.28, + "learning_rate": 1.3605240912933221e-05, + "loss": 0.3656, + "step": 8611, + "task_loss": 0.46465539932250977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16782517731189728, + "epoch": 7.28, + "learning_rate": 1.360101437024514e-05, + "loss": 0.4961, + "step": 8612, + "task_loss": 0.08161499351263046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7686130404472351, + "epoch": 7.28, + "learning_rate": 1.3596787827557059e-05, + "loss": 0.497, + "step": 8613, + "task_loss": 0.8932656049728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27487850189208984, + "epoch": 7.28, + "learning_rate": 1.3592561284868977e-05, + "loss": 0.3704, + "step": 8614, + "task_loss": 0.8220574259757996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3330289423465729, + "epoch": 7.28, + "learning_rate": 1.3588334742180897e-05, + "loss": 0.5226, + "step": 8615, + "task_loss": 0.7364484071731567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25871479511260986, + "epoch": 7.28, + "learning_rate": 1.3584108199492815e-05, + "loss": 0.3592, + "step": 8616, + "task_loss": 0.9599449038505554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33043479919433594, + "epoch": 7.28, + "learning_rate": 1.3579881656804736e-05, + "loss": 0.4356, + "step": 8617, + "task_loss": 0.6811276078224182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5672525763511658, + "epoch": 7.28, + "learning_rate": 1.3575655114116652e-05, + "loss": 0.4662, + "step": 8618, + "task_loss": 0.9465789794921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21204985678195953, + "epoch": 7.29, + "learning_rate": 1.357142857142857e-05, + "loss": 0.4529, + "step": 8619, + "task_loss": 0.12446191161870956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3419177532196045, + "epoch": 7.29, + "learning_rate": 1.3567202028740492e-05, + "loss": 0.3171, + "step": 8620, + "task_loss": 0.813675582408905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25606483221054077, + "epoch": 7.29, + "learning_rate": 1.356297548605241e-05, + "loss": 0.3336, + "step": 8621, + "task_loss": 0.05582047253847122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8029910922050476, + "epoch": 7.29, + "learning_rate": 1.355874894336433e-05, + "loss": 0.5935, + "step": 8622, + "task_loss": 0.787638783454895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40030235052108765, + "epoch": 7.29, + "learning_rate": 1.3554522400676248e-05, + "loss": 0.4107, + "step": 8623, + "task_loss": 0.6856584548950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3166125416755676, + "epoch": 7.29, + "learning_rate": 1.3550295857988166e-05, + "loss": 0.4431, + "step": 8624, + "task_loss": 0.31369078159332275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.409542441368103, + "epoch": 7.29, + "learning_rate": 1.3546069315300086e-05, + "loss": 0.3557, + "step": 8625, + "task_loss": 0.38980358839035034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3922196626663208, + "epoch": 7.29, + "learning_rate": 1.3541842772612004e-05, + "loss": 0.3394, + "step": 8626, + "task_loss": 0.6547351479530334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48632580041885376, + "epoch": 7.29, + "learning_rate": 1.3537616229923922e-05, + "loss": 0.3663, + "step": 8627, + "task_loss": 0.4329163432121277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1942305862903595, + "epoch": 7.29, + "learning_rate": 1.3533389687235843e-05, + "loss": 0.355, + "step": 8628, + "task_loss": 0.723336935043335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4070796072483063, + "epoch": 7.29, + "learning_rate": 1.352916314454776e-05, + "loss": 0.4961, + "step": 8629, + "task_loss": 1.051347017288208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4013592302799225, + "epoch": 7.29, + "learning_rate": 1.352493660185968e-05, + "loss": 0.3931, + "step": 8630, + "task_loss": 0.47482162714004517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19653481245040894, + "epoch": 7.3, + "learning_rate": 1.3520710059171599e-05, + "loss": 0.2683, + "step": 8631, + "task_loss": 0.025963615626096725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44278013706207275, + "epoch": 7.3, + "learning_rate": 1.3516483516483517e-05, + "loss": 0.3423, + "step": 8632, + "task_loss": 0.2865676283836365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4132575988769531, + "epoch": 7.3, + "learning_rate": 1.3512256973795437e-05, + "loss": 0.3011, + "step": 8633, + "task_loss": 0.505929172039032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3711126446723938, + "epoch": 7.3, + "learning_rate": 1.3508030431107355e-05, + "loss": 0.474, + "step": 8634, + "task_loss": 0.3926868140697479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2883041501045227, + "epoch": 7.3, + "learning_rate": 1.3503803888419273e-05, + "loss": 0.4002, + "step": 8635, + "task_loss": 0.9216748476028442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3653143048286438, + "epoch": 7.3, + "learning_rate": 1.3499577345731192e-05, + "loss": 0.3633, + "step": 8636, + "task_loss": 0.4034796357154846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3774307370185852, + "epoch": 7.3, + "learning_rate": 1.349535080304311e-05, + "loss": 0.401, + "step": 8637, + "task_loss": 0.7192955613136292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2929084599018097, + "epoch": 7.3, + "learning_rate": 1.3491124260355032e-05, + "loss": 0.2942, + "step": 8638, + "task_loss": 0.26929783821105957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3468337655067444, + "epoch": 7.3, + "learning_rate": 1.348689771766695e-05, + "loss": 0.3722, + "step": 8639, + "task_loss": 0.2371395081281662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4155738949775696, + "epoch": 7.3, + "learning_rate": 1.3482671174978866e-05, + "loss": 0.4063, + "step": 8640, + "task_loss": 0.6588971614837646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20427995920181274, + "epoch": 7.3, + "learning_rate": 1.3478444632290788e-05, + "loss": 0.3478, + "step": 8641, + "task_loss": 0.8175039291381836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31295791268348694, + "epoch": 7.3, + "learning_rate": 1.3474218089602706e-05, + "loss": 0.2734, + "step": 8642, + "task_loss": 0.5994687676429749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6649050712585449, + "epoch": 7.31, + "learning_rate": 1.3469991546914624e-05, + "loss": 0.4422, + "step": 8643, + "task_loss": 0.4326794147491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28210699558258057, + "epoch": 7.31, + "learning_rate": 1.3465765004226544e-05, + "loss": 0.3455, + "step": 8644, + "task_loss": 0.46208304166793823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27043116092681885, + "epoch": 7.31, + "learning_rate": 1.3461538461538462e-05, + "loss": 0.3437, + "step": 8645, + "task_loss": 0.0951174944639206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.552838146686554, + "epoch": 7.31, + "learning_rate": 1.3457311918850381e-05, + "loss": 0.4889, + "step": 8646, + "task_loss": 0.821312427520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.249177485704422, + "epoch": 7.31, + "learning_rate": 1.34530853761623e-05, + "loss": 0.3387, + "step": 8647, + "task_loss": 0.364907443523407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28236812353134155, + "epoch": 7.31, + "learning_rate": 1.3448858833474217e-05, + "loss": 0.311, + "step": 8648, + "task_loss": 0.14876903593540192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48333948850631714, + "epoch": 7.31, + "learning_rate": 1.3444632290786139e-05, + "loss": 0.4965, + "step": 8649, + "task_loss": 0.4771033525466919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40405696630477905, + "epoch": 7.31, + "learning_rate": 1.3440405748098055e-05, + "loss": 0.4487, + "step": 8650, + "task_loss": 1.2152924537658691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34432879090309143, + "epoch": 7.31, + "learning_rate": 1.3436179205409977e-05, + "loss": 0.4606, + "step": 8651, + "task_loss": 0.45158663392066956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3807457685470581, + "epoch": 7.31, + "learning_rate": 1.3431952662721895e-05, + "loss": 0.4927, + "step": 8652, + "task_loss": 1.035140037536621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37602412700653076, + "epoch": 7.31, + "learning_rate": 1.3427726120033813e-05, + "loss": 0.3116, + "step": 8653, + "task_loss": 0.7416728138923645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3265550434589386, + "epoch": 7.32, + "learning_rate": 1.3423499577345733e-05, + "loss": 0.3903, + "step": 8654, + "task_loss": 0.9113268256187439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3064642548561096, + "epoch": 7.32, + "learning_rate": 1.341927303465765e-05, + "loss": 0.3587, + "step": 8655, + "task_loss": 0.5006517171859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4127204418182373, + "epoch": 7.32, + "learning_rate": 1.3415046491969569e-05, + "loss": 0.3644, + "step": 8656, + "task_loss": 1.1537973880767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41386687755584717, + "epoch": 7.32, + "learning_rate": 1.3410819949281488e-05, + "loss": 0.3883, + "step": 8657, + "task_loss": 0.1646079421043396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20974020659923553, + "epoch": 7.32, + "learning_rate": 1.3406593406593406e-05, + "loss": 0.3376, + "step": 8658, + "task_loss": 1.1433213949203491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4548261761665344, + "epoch": 7.32, + "learning_rate": 1.3402366863905328e-05, + "loss": 0.3723, + "step": 8659, + "task_loss": 0.23796841502189636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5329951047897339, + "epoch": 7.32, + "learning_rate": 1.3398140321217246e-05, + "loss": 0.3801, + "step": 8660, + "task_loss": 1.2379002571105957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4140164852142334, + "epoch": 7.32, + "learning_rate": 1.3393913778529162e-05, + "loss": 0.4092, + "step": 8661, + "task_loss": 0.6631278991699219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3560425639152527, + "epoch": 7.32, + "learning_rate": 1.3389687235841084e-05, + "loss": 0.3162, + "step": 8662, + "task_loss": 0.15443722903728485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29310721158981323, + "epoch": 7.32, + "learning_rate": 1.3385460693153002e-05, + "loss": 0.336, + "step": 8663, + "task_loss": 0.3120836317539215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4309068024158478, + "epoch": 7.32, + "learning_rate": 1.338123415046492e-05, + "loss": 0.5265, + "step": 8664, + "task_loss": 0.6378076076507568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3921373784542084, + "epoch": 7.32, + "learning_rate": 1.337700760777684e-05, + "loss": 0.4362, + "step": 8665, + "task_loss": 0.9705500602722168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4018746614456177, + "epoch": 7.33, + "learning_rate": 1.3372781065088758e-05, + "loss": 0.416, + "step": 8666, + "task_loss": 0.7478629946708679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42230039834976196, + "epoch": 7.33, + "learning_rate": 1.3368554522400677e-05, + "loss": 0.4388, + "step": 8667, + "task_loss": 0.18032880127429962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4468831717967987, + "epoch": 7.33, + "learning_rate": 1.3364327979712595e-05, + "loss": 0.4433, + "step": 8668, + "task_loss": 0.9867525696754456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2717626690864563, + "epoch": 7.33, + "learning_rate": 1.3360101437024513e-05, + "loss": 0.4367, + "step": 8669, + "task_loss": 1.1541475057601929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40353021025657654, + "epoch": 7.33, + "learning_rate": 1.3355874894336435e-05, + "loss": 0.3665, + "step": 8670, + "task_loss": 0.7211099863052368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2924869954586029, + "epoch": 7.33, + "learning_rate": 1.3351648351648353e-05, + "loss": 0.4751, + "step": 8671, + "task_loss": 0.8653159141540527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2930752635002136, + "epoch": 7.33, + "learning_rate": 1.334742180896027e-05, + "loss": 0.3487, + "step": 8672, + "task_loss": 0.8559185862541199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4897002577781677, + "epoch": 7.33, + "learning_rate": 1.334319526627219e-05, + "loss": 0.4671, + "step": 8673, + "task_loss": 0.6756983995437622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.211037740111351, + "epoch": 7.33, + "learning_rate": 1.3338968723584109e-05, + "loss": 0.3556, + "step": 8674, + "task_loss": 0.6291630864143372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3360767960548401, + "epoch": 7.33, + "learning_rate": 1.3334742180896028e-05, + "loss": 0.3478, + "step": 8675, + "task_loss": 0.669925332069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4772075116634369, + "epoch": 7.33, + "learning_rate": 1.3330515638207946e-05, + "loss": 0.3986, + "step": 8676, + "task_loss": 0.6321662664413452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24405543506145477, + "epoch": 7.33, + "learning_rate": 1.3326289095519864e-05, + "loss": 0.3657, + "step": 8677, + "task_loss": 0.1629209816455841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4331558644771576, + "epoch": 7.34, + "learning_rate": 1.3322062552831784e-05, + "loss": 0.3649, + "step": 8678, + "task_loss": 1.1110867261886597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3111623525619507, + "epoch": 7.34, + "learning_rate": 1.3317836010143702e-05, + "loss": 0.4241, + "step": 8679, + "task_loss": 0.8777350187301636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.500487208366394, + "epoch": 7.34, + "learning_rate": 1.3313609467455624e-05, + "loss": 0.4438, + "step": 8680, + "task_loss": 0.9502843022346497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28063181042671204, + "epoch": 7.34, + "learning_rate": 1.3309382924767542e-05, + "loss": 0.4563, + "step": 8681, + "task_loss": 0.5501363277435303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43725597858428955, + "epoch": 7.34, + "learning_rate": 1.3305156382079458e-05, + "loss": 0.3996, + "step": 8682, + "task_loss": 1.4438990354537964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3341028392314911, + "epoch": 7.34, + "learning_rate": 1.330092983939138e-05, + "loss": 0.2957, + "step": 8683, + "task_loss": 0.5016899704933167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45649781823158264, + "epoch": 7.34, + "learning_rate": 1.3296703296703298e-05, + "loss": 0.2901, + "step": 8684, + "task_loss": 0.8486303091049194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45747965574264526, + "epoch": 7.34, + "learning_rate": 1.3292476754015216e-05, + "loss": 0.4244, + "step": 8685, + "task_loss": 0.47018468379974365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45868751406669617, + "epoch": 7.34, + "learning_rate": 1.3288250211327135e-05, + "loss": 0.4547, + "step": 8686, + "task_loss": 0.7005316615104675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3640376925468445, + "epoch": 7.34, + "learning_rate": 1.3284023668639053e-05, + "loss": 0.4192, + "step": 8687, + "task_loss": 0.13469792902469635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18828928470611572, + "epoch": 7.34, + "learning_rate": 1.3279797125950975e-05, + "loss": 0.3303, + "step": 8688, + "task_loss": 0.5118227005004883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2525501847267151, + "epoch": 7.34, + "learning_rate": 1.3275570583262891e-05, + "loss": 0.2949, + "step": 8689, + "task_loss": 0.3184832036495209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4442913234233856, + "epoch": 7.35, + "learning_rate": 1.327134404057481e-05, + "loss": 0.5297, + "step": 8690, + "task_loss": 0.5667042136192322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26817652583122253, + "epoch": 7.35, + "learning_rate": 1.326711749788673e-05, + "loss": 0.4387, + "step": 8691, + "task_loss": 0.27988946437835693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.213149756193161, + "epoch": 7.35, + "learning_rate": 1.3262890955198649e-05, + "loss": 0.3333, + "step": 8692, + "task_loss": 0.051734503358602524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.410963237285614, + "epoch": 7.35, + "learning_rate": 1.3258664412510565e-05, + "loss": 0.3889, + "step": 8693, + "task_loss": 0.6384717226028442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3206910192966461, + "epoch": 7.35, + "learning_rate": 1.3254437869822487e-05, + "loss": 0.3926, + "step": 8694, + "task_loss": 0.5632293224334717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49062347412109375, + "epoch": 7.35, + "learning_rate": 1.3250211327134405e-05, + "loss": 0.5304, + "step": 8695, + "task_loss": 0.5054396986961365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5377039909362793, + "epoch": 7.35, + "learning_rate": 1.3245984784446324e-05, + "loss": 0.4534, + "step": 8696, + "task_loss": 0.4006824493408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5226354598999023, + "epoch": 7.35, + "learning_rate": 1.3241758241758242e-05, + "loss": 0.51, + "step": 8697, + "task_loss": 0.7198591232299805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3524794280529022, + "epoch": 7.35, + "learning_rate": 1.323753169907016e-05, + "loss": 0.4425, + "step": 8698, + "task_loss": 0.7367194294929504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43069273233413696, + "epoch": 7.35, + "learning_rate": 1.323330515638208e-05, + "loss": 0.3887, + "step": 8699, + "task_loss": 0.6201344728469849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38378632068634033, + "epoch": 7.35, + "learning_rate": 1.3229078613693998e-05, + "loss": 0.4201, + "step": 8700, + "task_loss": 0.7699970006942749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40096601843833923, + "epoch": 7.35, + "learning_rate": 1.3224852071005916e-05, + "loss": 0.3627, + "step": 8701, + "task_loss": 1.081023931503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27769097685813904, + "epoch": 7.36, + "learning_rate": 1.3220625528317838e-05, + "loss": 0.4298, + "step": 8702, + "task_loss": 0.21367943286895752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2748101055622101, + "epoch": 7.36, + "learning_rate": 1.3216398985629756e-05, + "loss": 0.3124, + "step": 8703, + "task_loss": 0.17865511775016785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4741690754890442, + "epoch": 7.36, + "learning_rate": 1.3212172442941675e-05, + "loss": 0.3848, + "step": 8704, + "task_loss": 0.7435306310653687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2573087215423584, + "epoch": 7.36, + "learning_rate": 1.3207945900253593e-05, + "loss": 0.3032, + "step": 8705, + "task_loss": 1.0739707946777344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45296263694763184, + "epoch": 7.36, + "learning_rate": 1.3203719357565512e-05, + "loss": 0.5639, + "step": 8706, + "task_loss": 0.6117860078811646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4623570442199707, + "epoch": 7.36, + "learning_rate": 1.3199492814877431e-05, + "loss": 0.3687, + "step": 8707, + "task_loss": 0.7984516024589539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5325337648391724, + "epoch": 7.36, + "learning_rate": 1.319526627218935e-05, + "loss": 0.3462, + "step": 8708, + "task_loss": 1.0781035423278809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3728008270263672, + "epoch": 7.36, + "learning_rate": 1.3191039729501267e-05, + "loss": 0.3046, + "step": 8709, + "task_loss": 0.40966176986694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2892404794692993, + "epoch": 7.36, + "learning_rate": 1.3186813186813187e-05, + "loss": 0.3457, + "step": 8710, + "task_loss": 0.7240236401557922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2686305046081543, + "epoch": 7.36, + "learning_rate": 1.3182586644125105e-05, + "loss": 0.4605, + "step": 8711, + "task_loss": 0.4243745505809784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5931345224380493, + "epoch": 7.36, + "learning_rate": 1.3178360101437027e-05, + "loss": 0.4884, + "step": 8712, + "task_loss": 0.1295870840549469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5213711857795715, + "epoch": 7.36, + "learning_rate": 1.3174133558748945e-05, + "loss": 0.4058, + "step": 8713, + "task_loss": 0.24551443755626678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5323390364646912, + "epoch": 7.37, + "learning_rate": 1.3169907016060861e-05, + "loss": 0.3589, + "step": 8714, + "task_loss": 0.62319415807724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23142176866531372, + "epoch": 7.37, + "learning_rate": 1.3165680473372782e-05, + "loss": 0.497, + "step": 8715, + "task_loss": 0.09799226373434067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4250490069389343, + "epoch": 7.37, + "learning_rate": 1.31614539306847e-05, + "loss": 0.3637, + "step": 8716, + "task_loss": 0.578825056552887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3273549973964691, + "epoch": 7.37, + "learning_rate": 1.315722738799662e-05, + "loss": 0.4016, + "step": 8717, + "task_loss": 0.21588104963302612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5098577737808228, + "epoch": 7.37, + "learning_rate": 1.3153000845308538e-05, + "loss": 0.5196, + "step": 8718, + "task_loss": 0.8958927989006042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32574883103370667, + "epoch": 7.37, + "learning_rate": 1.3148774302620456e-05, + "loss": 0.3875, + "step": 8719, + "task_loss": 0.4246179759502411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3771357536315918, + "epoch": 7.37, + "learning_rate": 1.3144547759932378e-05, + "loss": 0.3966, + "step": 8720, + "task_loss": 0.6820191144943237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33946099877357483, + "epoch": 7.37, + "learning_rate": 1.3140321217244294e-05, + "loss": 0.4666, + "step": 8721, + "task_loss": 0.06475155055522919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3084554374217987, + "epoch": 7.37, + "learning_rate": 1.3136094674556212e-05, + "loss": 0.4377, + "step": 8722, + "task_loss": 0.7589787244796753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3583388328552246, + "epoch": 7.37, + "learning_rate": 1.3131868131868134e-05, + "loss": 0.3033, + "step": 8723, + "task_loss": 0.39629092812538147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37276482582092285, + "epoch": 7.37, + "learning_rate": 1.3127641589180052e-05, + "loss": 0.393, + "step": 8724, + "task_loss": 0.26068851351737976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7178219556808472, + "epoch": 7.38, + "learning_rate": 1.3123415046491971e-05, + "loss": 0.4809, + "step": 8725, + "task_loss": 0.8537161946296692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5248575210571289, + "epoch": 7.38, + "learning_rate": 1.311918850380389e-05, + "loss": 0.4028, + "step": 8726, + "task_loss": 1.1732163429260254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41542187333106995, + "epoch": 7.38, + "learning_rate": 1.3114961961115807e-05, + "loss": 0.4006, + "step": 8727, + "task_loss": 0.33321696519851685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35913047194480896, + "epoch": 7.38, + "learning_rate": 1.3110735418427727e-05, + "loss": 0.3435, + "step": 8728, + "task_loss": 0.7797046899795532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.265575647354126, + "epoch": 7.38, + "learning_rate": 1.3106508875739645e-05, + "loss": 0.3489, + "step": 8729, + "task_loss": 0.42789167165756226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7339180707931519, + "epoch": 7.38, + "learning_rate": 1.3102282333051563e-05, + "loss": 0.5076, + "step": 8730, + "task_loss": 0.5267307758331299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2875024080276489, + "epoch": 7.38, + "learning_rate": 1.3098055790363483e-05, + "loss": 0.3845, + "step": 8731, + "task_loss": 0.4913184344768524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47813326120376587, + "epoch": 7.38, + "learning_rate": 1.3093829247675401e-05, + "loss": 0.3513, + "step": 8732, + "task_loss": 0.49101531505584717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48580288887023926, + "epoch": 7.38, + "learning_rate": 1.3089602704987322e-05, + "loss": 0.533, + "step": 8733, + "task_loss": 0.3537449538707733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5516266822814941, + "epoch": 7.38, + "learning_rate": 1.308537616229924e-05, + "loss": 0.3627, + "step": 8734, + "task_loss": 0.7668045163154602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.369252473115921, + "epoch": 7.38, + "learning_rate": 1.3081149619611159e-05, + "loss": 0.4479, + "step": 8735, + "task_loss": 1.067196249961853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35767072439193726, + "epoch": 7.38, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.4375, + "step": 8736, + "task_loss": 0.20479261875152588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8271821141242981, + "epoch": 7.39, + "learning_rate": 1.3072696534234996e-05, + "loss": 0.5526, + "step": 8737, + "task_loss": 0.7176409363746643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25087279081344604, + "epoch": 7.39, + "learning_rate": 1.3068469991546914e-05, + "loss": 0.3388, + "step": 8738, + "task_loss": 0.287771612405777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37465551495552063, + "epoch": 7.39, + "learning_rate": 1.3064243448858834e-05, + "loss": 0.4188, + "step": 8739, + "task_loss": 0.20908790826797485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22373594343662262, + "epoch": 7.39, + "learning_rate": 1.3060016906170752e-05, + "loss": 0.3345, + "step": 8740, + "task_loss": 0.40413469076156616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3162654936313629, + "epoch": 7.39, + "learning_rate": 1.3055790363482674e-05, + "loss": 0.38, + "step": 8741, + "task_loss": 0.2968984544277191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4183715283870697, + "epoch": 7.39, + "learning_rate": 1.305156382079459e-05, + "loss": 0.3276, + "step": 8742, + "task_loss": 0.41445067524909973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3937259316444397, + "epoch": 7.39, + "learning_rate": 1.3047337278106508e-05, + "loss": 0.397, + "step": 8743, + "task_loss": 0.616841197013855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32108646631240845, + "epoch": 7.39, + "learning_rate": 1.304311073541843e-05, + "loss": 0.4189, + "step": 8744, + "task_loss": 0.1801363229751587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2981376647949219, + "epoch": 7.39, + "learning_rate": 1.3038884192730347e-05, + "loss": 0.5132, + "step": 8745, + "task_loss": 1.2172858715057373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37557950615882874, + "epoch": 7.39, + "learning_rate": 1.3034657650042267e-05, + "loss": 0.3243, + "step": 8746, + "task_loss": 0.0847000852227211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4972626268863678, + "epoch": 7.39, + "learning_rate": 1.3030431107354185e-05, + "loss": 0.3945, + "step": 8747, + "task_loss": 1.1767926216125488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23472389578819275, + "epoch": 7.39, + "learning_rate": 1.3026204564666103e-05, + "loss": 0.3502, + "step": 8748, + "task_loss": 0.720318615436554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32289016246795654, + "epoch": 7.4, + "learning_rate": 1.3021978021978023e-05, + "loss": 0.3628, + "step": 8749, + "task_loss": 0.6835820078849792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2789159119129181, + "epoch": 7.4, + "learning_rate": 1.3017751479289941e-05, + "loss": 0.32, + "step": 8750, + "task_loss": 0.04337075352668762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5624732971191406, + "epoch": 7.4, + "learning_rate": 1.3013524936601859e-05, + "loss": 0.4605, + "step": 8751, + "task_loss": 0.2518502175807953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37176501750946045, + "epoch": 7.4, + "learning_rate": 1.300929839391378e-05, + "loss": 0.3485, + "step": 8752, + "task_loss": 0.25776785612106323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.595321774482727, + "epoch": 7.4, + "learning_rate": 1.3005071851225697e-05, + "loss": 0.4822, + "step": 8753, + "task_loss": 0.5814257860183716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.369944304227829, + "epoch": 7.4, + "learning_rate": 1.3000845308537618e-05, + "loss": 0.3418, + "step": 8754, + "task_loss": 0.6023995876312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25749728083610535, + "epoch": 7.4, + "learning_rate": 1.2996618765849536e-05, + "loss": 0.3612, + "step": 8755, + "task_loss": 0.2631537616252899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3775484561920166, + "epoch": 7.4, + "learning_rate": 1.2992392223161454e-05, + "loss": 0.4429, + "step": 8756, + "task_loss": 0.16342660784721375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28732916712760925, + "epoch": 7.4, + "learning_rate": 1.2988165680473374e-05, + "loss": 0.3413, + "step": 8757, + "task_loss": 0.3057233691215515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38720449805259705, + "epoch": 7.4, + "learning_rate": 1.2983939137785292e-05, + "loss": 0.4612, + "step": 8758, + "task_loss": 0.8248981833457947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8441869020462036, + "epoch": 7.4, + "learning_rate": 1.297971259509721e-05, + "loss": 0.5863, + "step": 8759, + "task_loss": 2.0191593170166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5722415447235107, + "epoch": 7.4, + "learning_rate": 1.297548605240913e-05, + "loss": 0.5337, + "step": 8760, + "task_loss": 1.232725977897644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.56423020362854, + "epoch": 7.41, + "learning_rate": 1.2971259509721048e-05, + "loss": 0.4796, + "step": 8761, + "task_loss": 0.38701876997947693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5151432156562805, + "epoch": 7.41, + "learning_rate": 1.296703296703297e-05, + "loss": 0.5124, + "step": 8762, + "task_loss": 0.8187679052352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3470328450202942, + "epoch": 7.41, + "learning_rate": 1.2962806424344886e-05, + "loss": 0.3222, + "step": 8763, + "task_loss": 0.542992115020752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3804287910461426, + "epoch": 7.41, + "learning_rate": 1.2958579881656804e-05, + "loss": 0.3741, + "step": 8764, + "task_loss": 0.40430596470832825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.594031572341919, + "epoch": 7.41, + "learning_rate": 1.2954353338968725e-05, + "loss": 0.5291, + "step": 8765, + "task_loss": 1.3874036073684692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43327775597572327, + "epoch": 7.41, + "learning_rate": 1.2950126796280643e-05, + "loss": 0.4521, + "step": 8766, + "task_loss": 0.6687437295913696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5368461012840271, + "epoch": 7.41, + "learning_rate": 1.2945900253592561e-05, + "loss": 0.4776, + "step": 8767, + "task_loss": 0.9040844440460205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2301350235939026, + "epoch": 7.41, + "learning_rate": 1.2941673710904481e-05, + "loss": 0.5068, + "step": 8768, + "task_loss": 0.5706980228424072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35703396797180176, + "epoch": 7.41, + "learning_rate": 1.2937447168216399e-05, + "loss": 0.4162, + "step": 8769, + "task_loss": 0.025224963203072548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34440574049949646, + "epoch": 7.41, + "learning_rate": 1.2933220625528319e-05, + "loss": 0.4272, + "step": 8770, + "task_loss": 0.5785366296768188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3016105592250824, + "epoch": 7.41, + "learning_rate": 1.2928994082840237e-05, + "loss": 0.3668, + "step": 8771, + "task_loss": 0.16697748005390167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.546068549156189, + "epoch": 7.41, + "learning_rate": 1.2924767540152155e-05, + "loss": 0.4393, + "step": 8772, + "task_loss": 0.6965318322181702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49050113558769226, + "epoch": 7.42, + "learning_rate": 1.2920540997464076e-05, + "loss": 0.43, + "step": 8773, + "task_loss": 0.8907085061073303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2174573838710785, + "epoch": 7.42, + "learning_rate": 1.2916314454775993e-05, + "loss": 0.2989, + "step": 8774, + "task_loss": 0.2977360785007477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5348559617996216, + "epoch": 7.42, + "learning_rate": 1.2912087912087914e-05, + "loss": 0.4185, + "step": 8775, + "task_loss": 0.7671751379966736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4547441899776459, + "epoch": 7.42, + "learning_rate": 1.2907861369399832e-05, + "loss": 0.6148, + "step": 8776, + "task_loss": 1.0726524591445923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2778957188129425, + "epoch": 7.42, + "learning_rate": 1.290363482671175e-05, + "loss": 0.4236, + "step": 8777, + "task_loss": 0.286037802696228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3593287765979767, + "epoch": 7.42, + "learning_rate": 1.289940828402367e-05, + "loss": 0.4359, + "step": 8778, + "task_loss": 0.5764700174331665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5996150374412537, + "epoch": 7.42, + "learning_rate": 1.2895181741335588e-05, + "loss": 0.3847, + "step": 8779, + "task_loss": 0.3056407868862152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37883490324020386, + "epoch": 7.42, + "learning_rate": 1.2890955198647506e-05, + "loss": 0.3708, + "step": 8780, + "task_loss": 0.48093315958976746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26571112871170044, + "epoch": 7.42, + "learning_rate": 1.2886728655959426e-05, + "loss": 0.4002, + "step": 8781, + "task_loss": 0.21415020525455475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5493648052215576, + "epoch": 7.42, + "learning_rate": 1.2882502113271344e-05, + "loss": 0.3672, + "step": 8782, + "task_loss": 0.5265618562698364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13616833090782166, + "epoch": 7.42, + "learning_rate": 1.2878275570583265e-05, + "loss": 0.3355, + "step": 8783, + "task_loss": 0.254390686750412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3377780020236969, + "epoch": 7.42, + "learning_rate": 1.2874049027895183e-05, + "loss": 0.3434, + "step": 8784, + "task_loss": 0.36878496408462524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3592544198036194, + "epoch": 7.43, + "learning_rate": 1.28698224852071e-05, + "loss": 0.5211, + "step": 8785, + "task_loss": 0.7577958106994629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36772942543029785, + "epoch": 7.43, + "learning_rate": 1.2865595942519021e-05, + "loss": 0.3931, + "step": 8786, + "task_loss": 0.6353535652160645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43884795904159546, + "epoch": 7.43, + "learning_rate": 1.286136939983094e-05, + "loss": 0.3548, + "step": 8787, + "task_loss": 0.7720987796783447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39639517664909363, + "epoch": 7.43, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.3646, + "step": 8788, + "task_loss": 0.5395711064338684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45970532298088074, + "epoch": 7.43, + "learning_rate": 1.2852916314454777e-05, + "loss": 0.3565, + "step": 8789, + "task_loss": 0.3985978960990906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2692222595214844, + "epoch": 7.43, + "learning_rate": 1.2848689771766695e-05, + "loss": 0.4606, + "step": 8790, + "task_loss": 0.7749914526939392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35242217779159546, + "epoch": 7.43, + "learning_rate": 1.2844463229078615e-05, + "loss": 0.3378, + "step": 8791, + "task_loss": 0.19448921084403992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6296912431716919, + "epoch": 7.43, + "learning_rate": 1.2840236686390533e-05, + "loss": 0.4708, + "step": 8792, + "task_loss": 0.9199957847595215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4764683246612549, + "epoch": 7.43, + "learning_rate": 1.283601014370245e-05, + "loss": 0.391, + "step": 8793, + "task_loss": 0.3603833019733429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5886640548706055, + "epoch": 7.43, + "learning_rate": 1.2831783601014372e-05, + "loss": 0.4655, + "step": 8794, + "task_loss": 0.6643750667572021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4680676758289337, + "epoch": 7.43, + "learning_rate": 1.2827557058326289e-05, + "loss": 0.4574, + "step": 8795, + "task_loss": 0.18839330971240997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43024617433547974, + "epoch": 7.44, + "learning_rate": 1.2823330515638207e-05, + "loss": 0.4287, + "step": 8796, + "task_loss": 0.22537106275558472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3115088939666748, + "epoch": 7.44, + "learning_rate": 1.2819103972950128e-05, + "loss": 0.4744, + "step": 8797, + "task_loss": 0.03869803249835968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22779077291488647, + "epoch": 7.44, + "learning_rate": 1.2814877430262046e-05, + "loss": 0.3737, + "step": 8798, + "task_loss": 0.024479741230607033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41871362924575806, + "epoch": 7.44, + "learning_rate": 1.2810650887573966e-05, + "loss": 0.4318, + "step": 8799, + "task_loss": 0.909135103225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.264763206243515, + "epoch": 7.44, + "learning_rate": 1.2806424344885884e-05, + "loss": 0.4253, + "step": 8800, + "task_loss": 0.534777820110321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43819350004196167, + "epoch": 7.44, + "learning_rate": 1.2802197802197802e-05, + "loss": 0.4825, + "step": 8801, + "task_loss": 0.6781547665596008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3064551055431366, + "epoch": 7.44, + "learning_rate": 1.2797971259509722e-05, + "loss": 0.5006, + "step": 8802, + "task_loss": 0.8987891674041748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49585264921188354, + "epoch": 7.44, + "learning_rate": 1.279374471682164e-05, + "loss": 0.4279, + "step": 8803, + "task_loss": 0.702390730381012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3444201648235321, + "epoch": 7.44, + "learning_rate": 1.2789518174133561e-05, + "loss": 0.454, + "step": 8804, + "task_loss": 0.5378071665763855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18658792972564697, + "epoch": 7.44, + "learning_rate": 1.278529163144548e-05, + "loss": 0.2447, + "step": 8805, + "task_loss": 0.20429472625255585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3701540529727936, + "epoch": 7.44, + "learning_rate": 1.2781065088757396e-05, + "loss": 0.4203, + "step": 8806, + "task_loss": 0.181137815117836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6214600801467896, + "epoch": 7.44, + "learning_rate": 1.2776838546069317e-05, + "loss": 0.4063, + "step": 8807, + "task_loss": 0.6276839375495911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3892475366592407, + "epoch": 7.45, + "learning_rate": 1.2772612003381235e-05, + "loss": 0.4763, + "step": 8808, + "task_loss": 0.9633103013038635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5496237277984619, + "epoch": 7.45, + "learning_rate": 1.2768385460693153e-05, + "loss": 0.4863, + "step": 8809, + "task_loss": 0.7736993432044983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42636626958847046, + "epoch": 7.45, + "learning_rate": 1.2764158918005073e-05, + "loss": 0.3829, + "step": 8810, + "task_loss": 0.12462669610977173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20963656902313232, + "epoch": 7.45, + "learning_rate": 1.2759932375316991e-05, + "loss": 0.3667, + "step": 8811, + "task_loss": 0.22267165780067444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33875682950019836, + "epoch": 7.45, + "learning_rate": 1.275570583262891e-05, + "loss": 0.3839, + "step": 8812, + "task_loss": 0.4067496657371521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42679429054260254, + "epoch": 7.45, + "learning_rate": 1.2751479289940829e-05, + "loss": 0.4585, + "step": 8813, + "task_loss": 0.7633543014526367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24640582501888275, + "epoch": 7.45, + "learning_rate": 1.2747252747252747e-05, + "loss": 0.3258, + "step": 8814, + "task_loss": 0.1258978396654129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14239676296710968, + "epoch": 7.45, + "learning_rate": 1.2743026204564668e-05, + "loss": 0.3289, + "step": 8815, + "task_loss": 0.489003449678421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2638423442840576, + "epoch": 7.45, + "learning_rate": 1.2738799661876586e-05, + "loss": 0.471, + "step": 8816, + "task_loss": 0.52748703956604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39062976837158203, + "epoch": 7.45, + "learning_rate": 1.2734573119188503e-05, + "loss": 0.3689, + "step": 8817, + "task_loss": 0.21361885964870453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6631650924682617, + "epoch": 7.45, + "learning_rate": 1.2730346576500424e-05, + "loss": 0.446, + "step": 8818, + "task_loss": 1.7960970401763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6322839856147766, + "epoch": 7.45, + "learning_rate": 1.2726120033812342e-05, + "loss": 0.5278, + "step": 8819, + "task_loss": 0.6124684810638428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33605310320854187, + "epoch": 7.46, + "learning_rate": 1.2721893491124262e-05, + "loss": 0.3552, + "step": 8820, + "task_loss": 0.2629280090332031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30692028999328613, + "epoch": 7.46, + "learning_rate": 1.271766694843618e-05, + "loss": 0.4591, + "step": 8821, + "task_loss": 0.17651055753231049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42437222599983215, + "epoch": 7.46, + "learning_rate": 1.2713440405748098e-05, + "loss": 0.399, + "step": 8822, + "task_loss": 0.9975764155387878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4372026026248932, + "epoch": 7.46, + "learning_rate": 1.2709213863060018e-05, + "loss": 0.3536, + "step": 8823, + "task_loss": 0.36033856868743896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3313635587692261, + "epoch": 7.46, + "learning_rate": 1.2704987320371936e-05, + "loss": 0.4431, + "step": 8824, + "task_loss": 0.44853127002716064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20918361842632294, + "epoch": 7.46, + "learning_rate": 1.2700760777683854e-05, + "loss": 0.3561, + "step": 8825, + "task_loss": 0.2294342964887619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4094730019569397, + "epoch": 7.46, + "learning_rate": 1.2696534234995775e-05, + "loss": 0.3733, + "step": 8826, + "task_loss": 0.6787793040275574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3235383927822113, + "epoch": 7.46, + "learning_rate": 1.2692307692307691e-05, + "loss": 0.5037, + "step": 8827, + "task_loss": 0.9295744299888611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6724331378936768, + "epoch": 7.46, + "learning_rate": 1.2688081149619613e-05, + "loss": 0.448, + "step": 8828, + "task_loss": 0.9057302474975586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4415440261363983, + "epoch": 7.46, + "learning_rate": 1.2683854606931531e-05, + "loss": 0.4352, + "step": 8829, + "task_loss": 0.4219878315925598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34139037132263184, + "epoch": 7.46, + "learning_rate": 1.2679628064243449e-05, + "loss": 0.3915, + "step": 8830, + "task_loss": 1.065568208694458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4358985424041748, + "epoch": 7.46, + "learning_rate": 1.2675401521555369e-05, + "loss": 0.442, + "step": 8831, + "task_loss": 0.8653429746627808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1320231854915619, + "epoch": 7.47, + "learning_rate": 1.2671174978867287e-05, + "loss": 0.3247, + "step": 8832, + "task_loss": 0.27086687088012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4135437607765198, + "epoch": 7.47, + "learning_rate": 1.2666948436179206e-05, + "loss": 0.3872, + "step": 8833, + "task_loss": 0.8181164264678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3609418570995331, + "epoch": 7.47, + "learning_rate": 1.2662721893491125e-05, + "loss": 0.404, + "step": 8834, + "task_loss": 0.21947786211967468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2378140240907669, + "epoch": 7.47, + "learning_rate": 1.2658495350803043e-05, + "loss": 0.4053, + "step": 8835, + "task_loss": 0.5357357859611511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2541757822036743, + "epoch": 7.47, + "learning_rate": 1.2654268808114964e-05, + "loss": 0.3689, + "step": 8836, + "task_loss": 0.9392150640487671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6929994225502014, + "epoch": 7.47, + "learning_rate": 1.2650042265426882e-05, + "loss": 0.4424, + "step": 8837, + "task_loss": 0.3312847912311554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46591809391975403, + "epoch": 7.47, + "learning_rate": 1.2645815722738798e-05, + "loss": 0.3522, + "step": 8838, + "task_loss": 0.20715416967868805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2465772032737732, + "epoch": 7.47, + "learning_rate": 1.264158918005072e-05, + "loss": 0.3597, + "step": 8839, + "task_loss": 0.3812654912471771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4125104546546936, + "epoch": 7.47, + "learning_rate": 1.2637362637362638e-05, + "loss": 0.3281, + "step": 8840, + "task_loss": 0.11148248612880707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35475894808769226, + "epoch": 7.47, + "learning_rate": 1.2633136094674558e-05, + "loss": 0.3713, + "step": 8841, + "task_loss": 1.0705935955047607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.612317681312561, + "epoch": 7.47, + "learning_rate": 1.2628909551986476e-05, + "loss": 0.4326, + "step": 8842, + "task_loss": 1.7060391902923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31393200159072876, + "epoch": 7.47, + "learning_rate": 1.2624683009298394e-05, + "loss": 0.4207, + "step": 8843, + "task_loss": 0.47597038745880127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29434478282928467, + "epoch": 7.48, + "learning_rate": 1.2620456466610313e-05, + "loss": 0.4784, + "step": 8844, + "task_loss": 0.6927053332328796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2564399838447571, + "epoch": 7.48, + "learning_rate": 1.2616229923922232e-05, + "loss": 0.503, + "step": 8845, + "task_loss": 0.15902084112167358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.12941963970661163, + "epoch": 7.48, + "learning_rate": 1.261200338123415e-05, + "loss": 0.3658, + "step": 8846, + "task_loss": 0.13869743049144745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29704955220222473, + "epoch": 7.48, + "learning_rate": 1.2607776838546071e-05, + "loss": 0.4081, + "step": 8847, + "task_loss": 0.7093079686164856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23696014285087585, + "epoch": 7.48, + "learning_rate": 1.2603550295857989e-05, + "loss": 0.4511, + "step": 8848, + "task_loss": 0.36203086376190186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17028087377548218, + "epoch": 7.48, + "learning_rate": 1.2599323753169909e-05, + "loss": 0.3792, + "step": 8849, + "task_loss": 0.52455073595047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2588842809200287, + "epoch": 7.48, + "learning_rate": 1.2595097210481827e-05, + "loss": 0.3485, + "step": 8850, + "task_loss": 0.4023149013519287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4107217788696289, + "epoch": 7.48, + "learning_rate": 1.2590870667793745e-05, + "loss": 0.4252, + "step": 8851, + "task_loss": 0.4958897829055786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4180072546005249, + "epoch": 7.48, + "learning_rate": 1.2586644125105665e-05, + "loss": 0.3085, + "step": 8852, + "task_loss": 0.48024940490722656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4177292585372925, + "epoch": 7.48, + "learning_rate": 1.2582417582417583e-05, + "loss": 0.4258, + "step": 8853, + "task_loss": 0.28809022903442383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2382304072380066, + "epoch": 7.48, + "learning_rate": 1.25781910397295e-05, + "loss": 0.4979, + "step": 8854, + "task_loss": 0.07806060463190079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4962015748023987, + "epoch": 7.48, + "learning_rate": 1.257396449704142e-05, + "loss": 0.4374, + "step": 8855, + "task_loss": 0.49213072657585144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4624560475349426, + "epoch": 7.49, + "learning_rate": 1.2569737954353338e-05, + "loss": 0.3953, + "step": 8856, + "task_loss": 0.775689423084259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34697291254997253, + "epoch": 7.49, + "learning_rate": 1.256551141166526e-05, + "loss": 0.3447, + "step": 8857, + "task_loss": 0.8250847458839417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1905665546655655, + "epoch": 7.49, + "learning_rate": 1.2561284868977178e-05, + "loss": 0.3133, + "step": 8858, + "task_loss": 0.8301593661308289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2887338697910309, + "epoch": 7.49, + "learning_rate": 1.2557058326289094e-05, + "loss": 0.3094, + "step": 8859, + "task_loss": 0.3718477487564087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3488732874393463, + "epoch": 7.49, + "learning_rate": 1.2552831783601016e-05, + "loss": 0.4544, + "step": 8860, + "task_loss": 0.8264468908309937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35679537057876587, + "epoch": 7.49, + "learning_rate": 1.2548605240912934e-05, + "loss": 0.2933, + "step": 8861, + "task_loss": 0.32985883951187134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6441663503646851, + "epoch": 7.49, + "learning_rate": 1.2544378698224854e-05, + "loss": 0.4791, + "step": 8862, + "task_loss": 1.0300378799438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2912566065788269, + "epoch": 7.49, + "learning_rate": 1.2540152155536772e-05, + "loss": 0.3663, + "step": 8863, + "task_loss": 0.6975594758987427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24026711285114288, + "epoch": 7.49, + "learning_rate": 1.253592561284869e-05, + "loss": 0.2731, + "step": 8864, + "task_loss": 0.6998701095581055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.319728285074234, + "epoch": 7.49, + "learning_rate": 1.253169907016061e-05, + "loss": 0.3631, + "step": 8865, + "task_loss": 0.7206800580024719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2280956655740738, + "epoch": 7.49, + "learning_rate": 1.2527472527472527e-05, + "loss": 0.4835, + "step": 8866, + "task_loss": 1.1404627561569214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2612929344177246, + "epoch": 7.5, + "learning_rate": 1.2523245984784445e-05, + "loss": 0.3278, + "step": 8867, + "task_loss": 0.13966603577136993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.288677453994751, + "epoch": 7.5, + "learning_rate": 1.2519019442096367e-05, + "loss": 0.4266, + "step": 8868, + "task_loss": 0.8758043050765991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22271905839443207, + "epoch": 7.5, + "learning_rate": 1.2514792899408285e-05, + "loss": 0.3702, + "step": 8869, + "task_loss": 0.10380499064922333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25997495651245117, + "epoch": 7.5, + "learning_rate": 1.2510566356720205e-05, + "loss": 0.4088, + "step": 8870, + "task_loss": 0.3332921266555786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34686386585235596, + "epoch": 7.5, + "learning_rate": 1.2506339814032123e-05, + "loss": 0.379, + "step": 8871, + "task_loss": 0.42655396461486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4964175820350647, + "epoch": 7.5, + "learning_rate": 1.250211327134404e-05, + "loss": 0.3086, + "step": 8872, + "task_loss": 0.30568987131118774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3937375545501709, + "epoch": 7.5, + "learning_rate": 1.2497886728655959e-05, + "loss": 0.31, + "step": 8873, + "task_loss": 0.5503883957862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23620827496051788, + "epoch": 7.5, + "learning_rate": 1.2493660185967879e-05, + "loss": 0.4223, + "step": 8874, + "task_loss": 0.5142781138420105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2105209231376648, + "epoch": 7.5, + "learning_rate": 1.2489433643279798e-05, + "loss": 0.3584, + "step": 8875, + "task_loss": 0.06056657060980797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33140671253204346, + "epoch": 7.5, + "learning_rate": 1.2485207100591716e-05, + "loss": 0.378, + "step": 8876, + "task_loss": 0.28246891498565674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33876216411590576, + "epoch": 7.5, + "learning_rate": 1.2480980557903634e-05, + "loss": 0.4594, + "step": 8877, + "task_loss": 0.43334996700286865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30420565605163574, + "epoch": 7.5, + "learning_rate": 1.2476754015215554e-05, + "loss": 0.2899, + "step": 8878, + "task_loss": 0.11611993610858917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40651336312294006, + "epoch": 7.51, + "learning_rate": 1.2472527472527474e-05, + "loss": 0.4605, + "step": 8879, + "task_loss": 0.0965336337685585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3722517788410187, + "epoch": 7.51, + "learning_rate": 1.2468300929839392e-05, + "loss": 0.3537, + "step": 8880, + "task_loss": 0.7548536062240601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5767618417739868, + "epoch": 7.51, + "learning_rate": 1.246407438715131e-05, + "loss": 0.3653, + "step": 8881, + "task_loss": 0.6869463324546814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24362243711948395, + "epoch": 7.51, + "learning_rate": 1.245984784446323e-05, + "loss": 0.5005, + "step": 8882, + "task_loss": 1.1425288915634155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5760525465011597, + "epoch": 7.51, + "learning_rate": 1.245562130177515e-05, + "loss": 0.4056, + "step": 8883, + "task_loss": 0.9298889636993408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3821551501750946, + "epoch": 7.51, + "learning_rate": 1.2451394759087067e-05, + "loss": 0.4632, + "step": 8884, + "task_loss": 1.2720098495483398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24660804867744446, + "epoch": 7.51, + "learning_rate": 1.2447168216398985e-05, + "loss": 0.412, + "step": 8885, + "task_loss": 0.6145820021629333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18818311393260956, + "epoch": 7.51, + "learning_rate": 1.2442941673710905e-05, + "loss": 0.3328, + "step": 8886, + "task_loss": 0.4876258969306946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21004462242126465, + "epoch": 7.51, + "learning_rate": 1.2438715131022823e-05, + "loss": 0.3378, + "step": 8887, + "task_loss": 0.4048168361186981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.434485524892807, + "epoch": 7.51, + "learning_rate": 1.2434488588334743e-05, + "loss": 0.4139, + "step": 8888, + "task_loss": 1.3127778768539429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16074581444263458, + "epoch": 7.51, + "learning_rate": 1.2430262045646663e-05, + "loss": 0.3653, + "step": 8889, + "task_loss": 0.9348604679107666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37428492307662964, + "epoch": 7.51, + "learning_rate": 1.242603550295858e-05, + "loss": 0.4064, + "step": 8890, + "task_loss": 0.23023812472820282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29853618144989014, + "epoch": 7.52, + "learning_rate": 1.2421808960270499e-05, + "loss": 0.3676, + "step": 8891, + "task_loss": 0.26354339718818665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23341627418994904, + "epoch": 7.52, + "learning_rate": 1.2417582417582419e-05, + "loss": 0.2787, + "step": 8892, + "task_loss": 0.0650770515203476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3779405653476715, + "epoch": 7.52, + "learning_rate": 1.2413355874894338e-05, + "loss": 0.3873, + "step": 8893, + "task_loss": 0.2642820477485657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5107924938201904, + "epoch": 7.52, + "learning_rate": 1.2409129332206255e-05, + "loss": 0.3938, + "step": 8894, + "task_loss": 0.9316972494125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2726571261882782, + "epoch": 7.52, + "learning_rate": 1.2404902789518174e-05, + "loss": 0.441, + "step": 8895, + "task_loss": 0.26469066739082336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24930131435394287, + "epoch": 7.52, + "learning_rate": 1.2400676246830094e-05, + "loss": 0.4049, + "step": 8896, + "task_loss": 0.5534136295318604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4573225975036621, + "epoch": 7.52, + "learning_rate": 1.2396449704142012e-05, + "loss": 0.3932, + "step": 8897, + "task_loss": 0.643497884273529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6484670042991638, + "epoch": 7.52, + "learning_rate": 1.239222316145393e-05, + "loss": 0.3766, + "step": 8898, + "task_loss": 1.8177398443222046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6824172735214233, + "epoch": 7.52, + "learning_rate": 1.238799661876585e-05, + "loss": 0.5384, + "step": 8899, + "task_loss": 1.3811885118484497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37835413217544556, + "epoch": 7.52, + "learning_rate": 1.238377007607777e-05, + "loss": 0.3007, + "step": 8900, + "task_loss": 0.9385674595832825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28758400678634644, + "epoch": 7.52, + "learning_rate": 1.2379543533389688e-05, + "loss": 0.3362, + "step": 8901, + "task_loss": 0.20636217296123505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4175105690956116, + "epoch": 7.52, + "learning_rate": 1.2375316990701606e-05, + "loss": 0.4085, + "step": 8902, + "task_loss": 0.4218672215938568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31978416442871094, + "epoch": 7.53, + "learning_rate": 1.2371090448013526e-05, + "loss": 0.3666, + "step": 8903, + "task_loss": 0.33239826560020447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23870491981506348, + "epoch": 7.53, + "learning_rate": 1.2366863905325445e-05, + "loss": 0.359, + "step": 8904, + "task_loss": 0.8117021918296814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3809123635292053, + "epoch": 7.53, + "learning_rate": 1.2362637362637363e-05, + "loss": 0.405, + "step": 8905, + "task_loss": 0.6673400402069092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5134103298187256, + "epoch": 7.53, + "learning_rate": 1.2358410819949281e-05, + "loss": 0.4659, + "step": 8906, + "task_loss": 0.5493857860565186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36607301235198975, + "epoch": 7.53, + "learning_rate": 1.2354184277261201e-05, + "loss": 0.4395, + "step": 8907, + "task_loss": 0.7062835693359375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.522894561290741, + "epoch": 7.53, + "learning_rate": 1.2349957734573119e-05, + "loss": 0.4391, + "step": 8908, + "task_loss": 0.36490681767463684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2178795486688614, + "epoch": 7.53, + "learning_rate": 1.2345731191885039e-05, + "loss": 0.3607, + "step": 8909, + "task_loss": 0.2941228151321411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38187286257743835, + "epoch": 7.53, + "learning_rate": 1.2341504649196957e-05, + "loss": 0.3477, + "step": 8910, + "task_loss": 0.275736004114151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5548045635223389, + "epoch": 7.53, + "learning_rate": 1.2337278106508877e-05, + "loss": 0.4364, + "step": 8911, + "task_loss": 0.3776021897792816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32947251200675964, + "epoch": 7.53, + "learning_rate": 1.2333051563820795e-05, + "loss": 0.5436, + "step": 8912, + "task_loss": 0.7478124499320984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49149489402770996, + "epoch": 7.53, + "learning_rate": 1.2328825021132714e-05, + "loss": 0.4001, + "step": 8913, + "task_loss": 0.4371243119239807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2830374836921692, + "epoch": 7.53, + "learning_rate": 1.2324598478444632e-05, + "loss": 0.4831, + "step": 8914, + "task_loss": 1.4057505130767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31978940963745117, + "epoch": 7.54, + "learning_rate": 1.2320371935756552e-05, + "loss": 0.4154, + "step": 8915, + "task_loss": 0.9829380512237549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5351676344871521, + "epoch": 7.54, + "learning_rate": 1.231614539306847e-05, + "loss": 0.3455, + "step": 8916, + "task_loss": 0.18762677907943726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.474325031042099, + "epoch": 7.54, + "learning_rate": 1.231191885038039e-05, + "loss": 0.4569, + "step": 8917, + "task_loss": 0.8255453109741211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3540608882904053, + "epoch": 7.54, + "learning_rate": 1.230769230769231e-05, + "loss": 0.4666, + "step": 8918, + "task_loss": 0.8022740483283997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33126741647720337, + "epoch": 7.54, + "learning_rate": 1.2303465765004226e-05, + "loss": 0.4486, + "step": 8919, + "task_loss": 1.1576017141342163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6492169499397278, + "epoch": 7.54, + "learning_rate": 1.2299239222316146e-05, + "loss": 0.4494, + "step": 8920, + "task_loss": 1.002586841583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5384097695350647, + "epoch": 7.54, + "learning_rate": 1.2295012679628066e-05, + "loss": 0.5098, + "step": 8921, + "task_loss": 0.6145704388618469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32398903369903564, + "epoch": 7.54, + "learning_rate": 1.2290786136939984e-05, + "loss": 0.3572, + "step": 8922, + "task_loss": 0.7566264867782593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2882049083709717, + "epoch": 7.54, + "learning_rate": 1.2286559594251902e-05, + "loss": 0.2982, + "step": 8923, + "task_loss": 0.08466240763664246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.11329734325408936, + "epoch": 7.54, + "learning_rate": 1.2282333051563821e-05, + "loss": 0.3494, + "step": 8924, + "task_loss": 0.03128701075911522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35052669048309326, + "epoch": 7.54, + "learning_rate": 1.2278106508875741e-05, + "loss": 0.3503, + "step": 8925, + "task_loss": 1.2916423082351685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42401400208473206, + "epoch": 7.54, + "learning_rate": 1.227387996618766e-05, + "loss": 0.3693, + "step": 8926, + "task_loss": 1.1733742952346802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28700000047683716, + "epoch": 7.55, + "learning_rate": 1.2269653423499577e-05, + "loss": 0.4651, + "step": 8927, + "task_loss": 0.7809203863143921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3982163667678833, + "epoch": 7.55, + "learning_rate": 1.2265426880811497e-05, + "loss": 0.3856, + "step": 8928, + "task_loss": 0.2948783338069916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4382515549659729, + "epoch": 7.55, + "learning_rate": 1.2261200338123415e-05, + "loss": 0.3934, + "step": 8929, + "task_loss": 0.7454134225845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6083048582077026, + "epoch": 7.55, + "learning_rate": 1.2256973795435335e-05, + "loss": 0.47, + "step": 8930, + "task_loss": 0.6318076252937317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2604491710662842, + "epoch": 7.55, + "learning_rate": 1.2252747252747253e-05, + "loss": 0.3542, + "step": 8931, + "task_loss": 0.828550398349762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25900179147720337, + "epoch": 7.55, + "learning_rate": 1.2248520710059173e-05, + "loss": 0.3333, + "step": 8932, + "task_loss": 0.3528364598751068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43939876556396484, + "epoch": 7.55, + "learning_rate": 1.224429416737109e-05, + "loss": 0.38, + "step": 8933, + "task_loss": 0.5450295805931091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4828951954841614, + "epoch": 7.55, + "learning_rate": 1.224006762468301e-05, + "loss": 0.361, + "step": 8934, + "task_loss": 1.1304030418395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.623633623123169, + "epoch": 7.55, + "learning_rate": 1.2235841081994928e-05, + "loss": 0.4303, + "step": 8935, + "task_loss": 0.46177998185157776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17230787873268127, + "epoch": 7.55, + "learning_rate": 1.2231614539306848e-05, + "loss": 0.3842, + "step": 8936, + "task_loss": 0.331039160490036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22837987542152405, + "epoch": 7.55, + "learning_rate": 1.2227387996618766e-05, + "loss": 0.3194, + "step": 8937, + "task_loss": 0.1575840413570404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3448342978954315, + "epoch": 7.56, + "learning_rate": 1.2223161453930686e-05, + "loss": 0.4067, + "step": 8938, + "task_loss": 0.20840643346309662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36390653252601624, + "epoch": 7.56, + "learning_rate": 1.2218934911242604e-05, + "loss": 0.3667, + "step": 8939, + "task_loss": 0.2343478500843048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25411197543144226, + "epoch": 7.56, + "learning_rate": 1.2214708368554522e-05, + "loss": 0.3227, + "step": 8940, + "task_loss": 0.7479848861694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1864277422428131, + "epoch": 7.56, + "learning_rate": 1.2210481825866442e-05, + "loss": 0.3126, + "step": 8941, + "task_loss": 0.5073248147964478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4902167320251465, + "epoch": 7.56, + "learning_rate": 1.2206255283178361e-05, + "loss": 0.4038, + "step": 8942, + "task_loss": 0.6532389521598816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6410514116287231, + "epoch": 7.56, + "learning_rate": 1.220202874049028e-05, + "loss": 0.3922, + "step": 8943, + "task_loss": 1.1557260751724243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4287768304347992, + "epoch": 7.56, + "learning_rate": 1.2197802197802198e-05, + "loss": 0.4335, + "step": 8944, + "task_loss": 1.068378210067749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34037670493125916, + "epoch": 7.56, + "learning_rate": 1.2193575655114117e-05, + "loss": 0.3926, + "step": 8945, + "task_loss": 0.5771277546882629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3525254726409912, + "epoch": 7.56, + "learning_rate": 1.2189349112426037e-05, + "loss": 0.4133, + "step": 8946, + "task_loss": 1.157086730003357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19959726929664612, + "epoch": 7.56, + "learning_rate": 1.2185122569737955e-05, + "loss": 0.4065, + "step": 8947, + "task_loss": 0.19284602999687195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3066311478614807, + "epoch": 7.56, + "learning_rate": 1.2180896027049873e-05, + "loss": 0.3785, + "step": 8948, + "task_loss": 0.8857675194740295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2674952745437622, + "epoch": 7.56, + "learning_rate": 1.2176669484361793e-05, + "loss": 0.4164, + "step": 8949, + "task_loss": 0.1879102885723114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47725144028663635, + "epoch": 7.57, + "learning_rate": 1.2172442941673713e-05, + "loss": 0.3862, + "step": 8950, + "task_loss": 1.2685786485671997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.424798846244812, + "epoch": 7.57, + "learning_rate": 1.216821639898563e-05, + "loss": 0.4041, + "step": 8951, + "task_loss": 0.7951120138168335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31424546241760254, + "epoch": 7.57, + "learning_rate": 1.2163989856297549e-05, + "loss": 0.3701, + "step": 8952, + "task_loss": 1.26388680934906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46314823627471924, + "epoch": 7.57, + "learning_rate": 1.2159763313609468e-05, + "loss": 0.5968, + "step": 8953, + "task_loss": 1.3174591064453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4308896064758301, + "epoch": 7.57, + "learning_rate": 1.2155536770921386e-05, + "loss": 0.4049, + "step": 8954, + "task_loss": 1.1437594890594482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3096659779548645, + "epoch": 7.57, + "learning_rate": 1.2151310228233306e-05, + "loss": 0.3068, + "step": 8955, + "task_loss": 0.15330055356025696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3247291147708893, + "epoch": 7.57, + "learning_rate": 1.2147083685545224e-05, + "loss": 0.2858, + "step": 8956, + "task_loss": 0.47926098108291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38016843795776367, + "epoch": 7.57, + "learning_rate": 1.2142857142857144e-05, + "loss": 0.3266, + "step": 8957, + "task_loss": 0.4028185307979584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48601335287094116, + "epoch": 7.57, + "learning_rate": 1.2138630600169062e-05, + "loss": 0.4343, + "step": 8958, + "task_loss": 0.1442585289478302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3384753167629242, + "epoch": 7.57, + "learning_rate": 1.2134404057480982e-05, + "loss": 0.3032, + "step": 8959, + "task_loss": 0.25352779030799866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44721609354019165, + "epoch": 7.57, + "learning_rate": 1.21301775147929e-05, + "loss": 0.427, + "step": 8960, + "task_loss": 1.0813796520233154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48134347796440125, + "epoch": 7.57, + "learning_rate": 1.2125950972104818e-05, + "loss": 0.3702, + "step": 8961, + "task_loss": 0.17928604781627655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16162718832492828, + "epoch": 7.58, + "learning_rate": 1.2121724429416738e-05, + "loss": 0.349, + "step": 8962, + "task_loss": 0.25479400157928467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32777196168899536, + "epoch": 7.58, + "learning_rate": 1.2117497886728657e-05, + "loss": 0.3983, + "step": 8963, + "task_loss": 0.8173986673355103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5242729783058167, + "epoch": 7.58, + "learning_rate": 1.2113271344040575e-05, + "loss": 0.3049, + "step": 8964, + "task_loss": 0.6155838966369629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40636640787124634, + "epoch": 7.58, + "learning_rate": 1.2109044801352493e-05, + "loss": 0.3511, + "step": 8965, + "task_loss": 0.2831018269062042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.327983021736145, + "epoch": 7.58, + "learning_rate": 1.2104818258664413e-05, + "loss": 0.2996, + "step": 8966, + "task_loss": 0.42353391647338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3352714776992798, + "epoch": 7.58, + "learning_rate": 1.2100591715976333e-05, + "loss": 0.3305, + "step": 8967, + "task_loss": 0.6858367323875427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5846202969551086, + "epoch": 7.58, + "learning_rate": 1.2096365173288251e-05, + "loss": 0.6059, + "step": 8968, + "task_loss": 0.2644737958908081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3736933469772339, + "epoch": 7.58, + "learning_rate": 1.2092138630600169e-05, + "loss": 0.438, + "step": 8969, + "task_loss": 0.44630762934684753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2074945718050003, + "epoch": 7.58, + "learning_rate": 1.2087912087912089e-05, + "loss": 0.3863, + "step": 8970, + "task_loss": 0.8092791438102722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.319236159324646, + "epoch": 7.58, + "learning_rate": 1.2083685545224008e-05, + "loss": 0.3868, + "step": 8971, + "task_loss": 0.011440849862992764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3765392601490021, + "epoch": 7.58, + "learning_rate": 1.2079459002535925e-05, + "loss": 0.3589, + "step": 8972, + "task_loss": 0.5851409435272217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3749988079071045, + "epoch": 7.58, + "learning_rate": 1.2075232459847845e-05, + "loss": 0.3049, + "step": 8973, + "task_loss": 0.6993555426597595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44657981395721436, + "epoch": 7.59, + "learning_rate": 1.2071005917159764e-05, + "loss": 0.4233, + "step": 8974, + "task_loss": 0.3421289324760437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29481810331344604, + "epoch": 7.59, + "learning_rate": 1.2066779374471682e-05, + "loss": 0.4469, + "step": 8975, + "task_loss": 0.13945460319519043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5465073585510254, + "epoch": 7.59, + "learning_rate": 1.20625528317836e-05, + "loss": 0.5047, + "step": 8976, + "task_loss": 1.0065362453460693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27463531494140625, + "epoch": 7.59, + "learning_rate": 1.205832628909552e-05, + "loss": 0.4049, + "step": 8977, + "task_loss": 0.054988909512758255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33771124482154846, + "epoch": 7.59, + "learning_rate": 1.205409974640744e-05, + "loss": 0.3692, + "step": 8978, + "task_loss": 0.3773528039455414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4338395595550537, + "epoch": 7.59, + "learning_rate": 1.2049873203719358e-05, + "loss": 0.414, + "step": 8979, + "task_loss": 0.6713299751281738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2405247986316681, + "epoch": 7.59, + "learning_rate": 1.2045646661031278e-05, + "loss": 0.3377, + "step": 8980, + "task_loss": 0.35603412985801697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4226876497268677, + "epoch": 7.59, + "learning_rate": 1.2041420118343196e-05, + "loss": 0.3748, + "step": 8981, + "task_loss": 0.38470765948295593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1752716451883316, + "epoch": 7.59, + "learning_rate": 1.2037193575655115e-05, + "loss": 0.3299, + "step": 8982, + "task_loss": 1.0044262409210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8618215918540955, + "epoch": 7.59, + "learning_rate": 1.2032967032967033e-05, + "loss": 0.4886, + "step": 8983, + "task_loss": 0.511257529258728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5087677836418152, + "epoch": 7.59, + "learning_rate": 1.2028740490278953e-05, + "loss": 0.3417, + "step": 8984, + "task_loss": 0.618377685546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24110707640647888, + "epoch": 7.59, + "learning_rate": 1.2024513947590871e-05, + "loss": 0.3462, + "step": 8985, + "task_loss": 0.3202146291732788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42054715752601624, + "epoch": 7.6, + "learning_rate": 1.202028740490279e-05, + "loss": 0.4972, + "step": 8986, + "task_loss": 0.19251422584056854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42426180839538574, + "epoch": 7.6, + "learning_rate": 1.2016060862214709e-05, + "loss": 0.4202, + "step": 8987, + "task_loss": 0.45764419436454773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37431126832962036, + "epoch": 7.6, + "learning_rate": 1.2011834319526629e-05, + "loss": 0.4048, + "step": 8988, + "task_loss": 0.4787616729736328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2588847875595093, + "epoch": 7.6, + "learning_rate": 1.2007607776838547e-05, + "loss": 0.4498, + "step": 8989, + "task_loss": 0.3828330934047699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7134664058685303, + "epoch": 7.6, + "learning_rate": 1.2003381234150465e-05, + "loss": 0.475, + "step": 8990, + "task_loss": 1.1319533586502075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4578288793563843, + "epoch": 7.6, + "learning_rate": 1.1999154691462385e-05, + "loss": 0.4477, + "step": 8991, + "task_loss": 0.7399687170982361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2551916241645813, + "epoch": 7.6, + "learning_rate": 1.1994928148774304e-05, + "loss": 0.3173, + "step": 8992, + "task_loss": 0.17864371836185455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2895351052284241, + "epoch": 7.6, + "learning_rate": 1.199070160608622e-05, + "loss": 0.3202, + "step": 8993, + "task_loss": 0.3202516734600067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2535991370677948, + "epoch": 7.6, + "learning_rate": 1.198647506339814e-05, + "loss": 0.3839, + "step": 8994, + "task_loss": 0.30042195320129395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3672291040420532, + "epoch": 7.6, + "learning_rate": 1.198224852071006e-05, + "loss": 0.448, + "step": 8995, + "task_loss": 0.3989412486553192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24742627143859863, + "epoch": 7.6, + "learning_rate": 1.197802197802198e-05, + "loss": 0.3432, + "step": 8996, + "task_loss": 0.3141601085662842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4100652039051056, + "epoch": 7.6, + "learning_rate": 1.1973795435333896e-05, + "loss": 0.4238, + "step": 8997, + "task_loss": 0.10547729581594467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31135597825050354, + "epoch": 7.61, + "learning_rate": 1.1969568892645816e-05, + "loss": 0.2401, + "step": 8998, + "task_loss": 0.39433011412620544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17257516086101532, + "epoch": 7.61, + "learning_rate": 1.1965342349957736e-05, + "loss": 0.4469, + "step": 8999, + "task_loss": 0.3729266822338104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34770286083221436, + "epoch": 7.61, + "learning_rate": 1.1961115807269654e-05, + "loss": 0.4468, + "step": 9000, + "task_loss": 0.03814266622066498 + }, + { + "epoch": 7.61, + "eval_accuracy": 0.9144158415841585, + "eval_loss": 0.2698863446712494, + "eval_runtime": 226.5568, + "eval_samples_per_second": 111.451, + "eval_steps_per_second": 0.874, + "step": 9000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3650602102279663, + "epoch": 7.61, + "learning_rate": 1.1956889264581572e-05, + "loss": 0.3361, + "step": 9001, + "task_loss": 0.6610453128814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22278177738189697, + "epoch": 7.61, + "learning_rate": 1.1952662721893492e-05, + "loss": 0.3935, + "step": 9002, + "task_loss": 0.08621159195899963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27735745906829834, + "epoch": 7.61, + "learning_rate": 1.1948436179205411e-05, + "loss": 0.3915, + "step": 9003, + "task_loss": 0.27255043387413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37566742300987244, + "epoch": 7.61, + "learning_rate": 1.194420963651733e-05, + "loss": 0.3423, + "step": 9004, + "task_loss": 0.8501888513565063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1851779818534851, + "epoch": 7.61, + "learning_rate": 1.1939983093829247e-05, + "loss": 0.3247, + "step": 9005, + "task_loss": 0.10457085072994232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24098354578018188, + "epoch": 7.61, + "learning_rate": 1.1935756551141167e-05, + "loss": 0.3528, + "step": 9006, + "task_loss": 0.280144602060318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36806613206863403, + "epoch": 7.61, + "learning_rate": 1.1931530008453085e-05, + "loss": 0.4265, + "step": 9007, + "task_loss": 1.0227279663085938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2922498285770416, + "epoch": 7.61, + "learning_rate": 1.1927303465765005e-05, + "loss": 0.3148, + "step": 9008, + "task_loss": 0.10302627831697464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4662601947784424, + "epoch": 7.61, + "learning_rate": 1.1923076923076925e-05, + "loss": 0.4752, + "step": 9009, + "task_loss": 0.7276266813278198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5095608234405518, + "epoch": 7.62, + "learning_rate": 1.1918850380388843e-05, + "loss": 0.4364, + "step": 9010, + "task_loss": 1.2232692241668701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19969511032104492, + "epoch": 7.62, + "learning_rate": 1.191462383770076e-05, + "loss": 0.3228, + "step": 9011, + "task_loss": 0.759532630443573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3929698169231415, + "epoch": 7.62, + "learning_rate": 1.191039729501268e-05, + "loss": 0.4696, + "step": 9012, + "task_loss": 0.8393535017967224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5342813730239868, + "epoch": 7.62, + "learning_rate": 1.19061707523246e-05, + "loss": 0.4763, + "step": 9013, + "task_loss": 1.1013715267181396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31900882720947266, + "epoch": 7.62, + "learning_rate": 1.1901944209636518e-05, + "loss": 0.3288, + "step": 9014, + "task_loss": 0.5611660480499268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3892659544944763, + "epoch": 7.62, + "learning_rate": 1.1897717666948436e-05, + "loss": 0.4729, + "step": 9015, + "task_loss": 0.9186141490936279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24343809485435486, + "epoch": 7.62, + "learning_rate": 1.1893491124260356e-05, + "loss": 0.3368, + "step": 9016, + "task_loss": 0.6908840537071228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4670860767364502, + "epoch": 7.62, + "learning_rate": 1.1889264581572276e-05, + "loss": 0.5594, + "step": 9017, + "task_loss": 0.04985443130135536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22475585341453552, + "epoch": 7.62, + "learning_rate": 1.1885038038884192e-05, + "loss": 0.3454, + "step": 9018, + "task_loss": 0.09994767606258392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4485097825527191, + "epoch": 7.62, + "learning_rate": 1.1880811496196112e-05, + "loss": 0.3937, + "step": 9019, + "task_loss": 0.843705415725708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5624816417694092, + "epoch": 7.62, + "learning_rate": 1.1876584953508032e-05, + "loss": 0.3209, + "step": 9020, + "task_loss": 0.7311621904373169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4427991509437561, + "epoch": 7.63, + "learning_rate": 1.187235841081995e-05, + "loss": 0.3302, + "step": 9021, + "task_loss": 0.720029890537262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7290709614753723, + "epoch": 7.63, + "learning_rate": 1.1868131868131868e-05, + "loss": 0.4931, + "step": 9022, + "task_loss": 0.9248529672622681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.593668520450592, + "epoch": 7.63, + "learning_rate": 1.1863905325443787e-05, + "loss": 0.411, + "step": 9023, + "task_loss": 1.2561993598937988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3528909683227539, + "epoch": 7.63, + "learning_rate": 1.1859678782755707e-05, + "loss": 0.4499, + "step": 9024, + "task_loss": 0.6737098693847656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5126925110816956, + "epoch": 7.63, + "learning_rate": 1.1855452240067625e-05, + "loss": 0.5475, + "step": 9025, + "task_loss": 0.6768222451210022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30118900537490845, + "epoch": 7.63, + "learning_rate": 1.1851225697379543e-05, + "loss": 0.3384, + "step": 9026, + "task_loss": 0.09395896643400192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33822476863861084, + "epoch": 7.63, + "learning_rate": 1.1846999154691463e-05, + "loss": 0.4809, + "step": 9027, + "task_loss": 0.4942106008529663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.531981885433197, + "epoch": 7.63, + "learning_rate": 1.1842772612003383e-05, + "loss": 0.5651, + "step": 9028, + "task_loss": 0.6738797426223755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26686498522758484, + "epoch": 7.63, + "learning_rate": 1.18385460693153e-05, + "loss": 0.3826, + "step": 9029, + "task_loss": 0.8414326310157776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5362911224365234, + "epoch": 7.63, + "learning_rate": 1.1834319526627219e-05, + "loss": 0.5541, + "step": 9030, + "task_loss": 1.2536871433258057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24590009450912476, + "epoch": 7.63, + "learning_rate": 1.1830092983939139e-05, + "loss": 0.4244, + "step": 9031, + "task_loss": 0.1383514255285263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4412611722946167, + "epoch": 7.63, + "learning_rate": 1.1825866441251057e-05, + "loss": 0.6291, + "step": 9032, + "task_loss": 0.48700597882270813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4063577950000763, + "epoch": 7.64, + "learning_rate": 1.1821639898562976e-05, + "loss": 0.3208, + "step": 9033, + "task_loss": 0.7193591594696045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20122285187244415, + "epoch": 7.64, + "learning_rate": 1.1817413355874894e-05, + "loss": 0.4108, + "step": 9034, + "task_loss": 0.023574326187372208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41745832562446594, + "epoch": 7.64, + "learning_rate": 1.1813186813186814e-05, + "loss": 0.427, + "step": 9035, + "task_loss": 0.47051501274108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3117230534553528, + "epoch": 7.64, + "learning_rate": 1.1808960270498732e-05, + "loss": 0.3627, + "step": 9036, + "task_loss": 0.48757582902908325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2580595314502716, + "epoch": 7.64, + "learning_rate": 1.1804733727810652e-05, + "loss": 0.4546, + "step": 9037, + "task_loss": 0.7048425078392029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46541884541511536, + "epoch": 7.64, + "learning_rate": 1.1800507185122572e-05, + "loss": 0.4083, + "step": 9038, + "task_loss": 0.5239202976226807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17622461915016174, + "epoch": 7.64, + "learning_rate": 1.1796280642434488e-05, + "loss": 0.3944, + "step": 9039, + "task_loss": 0.3850150406360626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2719927728176117, + "epoch": 7.64, + "learning_rate": 1.1792054099746408e-05, + "loss": 0.2775, + "step": 9040, + "task_loss": 0.5853669047355652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23604165017604828, + "epoch": 7.64, + "learning_rate": 1.1787827557058327e-05, + "loss": 0.3987, + "step": 9041, + "task_loss": 0.5697468519210815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32700788974761963, + "epoch": 7.64, + "learning_rate": 1.1783601014370246e-05, + "loss": 0.4541, + "step": 9042, + "task_loss": 0.8926465511322021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37619954347610474, + "epoch": 7.64, + "learning_rate": 1.1779374471682164e-05, + "loss": 0.47, + "step": 9043, + "task_loss": 1.0556493997573853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2214282602071762, + "epoch": 7.64, + "learning_rate": 1.1775147928994083e-05, + "loss": 0.3603, + "step": 9044, + "task_loss": 0.8108346462249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4927966892719269, + "epoch": 7.65, + "learning_rate": 1.1770921386306003e-05, + "loss": 0.4528, + "step": 9045, + "task_loss": 1.1591700315475464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4457417130470276, + "epoch": 7.65, + "learning_rate": 1.1766694843617921e-05, + "loss": 0.4117, + "step": 9046, + "task_loss": 1.0482919216156006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22907936573028564, + "epoch": 7.65, + "learning_rate": 1.1762468300929839e-05, + "loss": 0.4195, + "step": 9047, + "task_loss": 0.510393500328064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33623653650283813, + "epoch": 7.65, + "learning_rate": 1.1758241758241759e-05, + "loss": 0.3139, + "step": 9048, + "task_loss": 0.8420602083206177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4092572033405304, + "epoch": 7.65, + "learning_rate": 1.1754015215553679e-05, + "loss": 0.4391, + "step": 9049, + "task_loss": 0.44004857540130615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2946168780326843, + "epoch": 7.65, + "learning_rate": 1.1749788672865597e-05, + "loss": 0.2832, + "step": 9050, + "task_loss": 0.16051824390888214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4082638919353485, + "epoch": 7.65, + "learning_rate": 1.1745562130177515e-05, + "loss": 0.3427, + "step": 9051, + "task_loss": 0.4675252139568329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34707850217819214, + "epoch": 7.65, + "learning_rate": 1.1741335587489434e-05, + "loss": 0.3775, + "step": 9052, + "task_loss": 0.49460896849632263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4688130021095276, + "epoch": 7.65, + "learning_rate": 1.1737109044801352e-05, + "loss": 0.3211, + "step": 9053, + "task_loss": 0.39294153451919556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3327360153198242, + "epoch": 7.65, + "learning_rate": 1.1732882502113272e-05, + "loss": 0.4655, + "step": 9054, + "task_loss": 0.6813886761665344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3562341332435608, + "epoch": 7.65, + "learning_rate": 1.172865595942519e-05, + "loss": 0.3107, + "step": 9055, + "task_loss": 0.7206686735153198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22212161123752594, + "epoch": 7.65, + "learning_rate": 1.172442941673711e-05, + "loss": 0.3308, + "step": 9056, + "task_loss": 0.5155127048492432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49100273847579956, + "epoch": 7.66, + "learning_rate": 1.1720202874049028e-05, + "loss": 0.3125, + "step": 9057, + "task_loss": 0.8621499538421631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4285551607608795, + "epoch": 7.66, + "learning_rate": 1.1715976331360948e-05, + "loss": 0.3666, + "step": 9058, + "task_loss": 0.2908785045146942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30743879079818726, + "epoch": 7.66, + "learning_rate": 1.1711749788672866e-05, + "loss": 0.3687, + "step": 9059, + "task_loss": 0.2561503052711487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40027230978012085, + "epoch": 7.66, + "learning_rate": 1.1707523245984786e-05, + "loss": 0.3293, + "step": 9060, + "task_loss": 0.19694428145885468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5225579738616943, + "epoch": 7.66, + "learning_rate": 1.1703296703296704e-05, + "loss": 0.4008, + "step": 9061, + "task_loss": 0.5215948224067688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42380574345588684, + "epoch": 7.66, + "learning_rate": 1.1699070160608623e-05, + "loss": 0.4518, + "step": 9062, + "task_loss": 0.3542225658893585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.458027720451355, + "epoch": 7.66, + "learning_rate": 1.1694843617920541e-05, + "loss": 0.4771, + "step": 9063, + "task_loss": 0.98842853307724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8132627606391907, + "epoch": 7.66, + "learning_rate": 1.169061707523246e-05, + "loss": 0.3968, + "step": 9064, + "task_loss": 1.0664492845535278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3299165368080139, + "epoch": 7.66, + "learning_rate": 1.168639053254438e-05, + "loss": 0.3946, + "step": 9065, + "task_loss": 0.9507863521575928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21241885423660278, + "epoch": 7.66, + "learning_rate": 1.1682163989856299e-05, + "loss": 0.2898, + "step": 9066, + "task_loss": 0.19051221013069153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4213418960571289, + "epoch": 7.66, + "learning_rate": 1.1677937447168217e-05, + "loss": 0.3457, + "step": 9067, + "task_loss": 0.9177011251449585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5141689777374268, + "epoch": 7.66, + "learning_rate": 1.1673710904480135e-05, + "loss": 0.4827, + "step": 9068, + "task_loss": 0.5235229730606079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3967817425727844, + "epoch": 7.67, + "learning_rate": 1.1669484361792055e-05, + "loss": 0.4485, + "step": 9069, + "task_loss": 0.23925037682056427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34805598855018616, + "epoch": 7.67, + "learning_rate": 1.1665257819103974e-05, + "loss": 0.4355, + "step": 9070, + "task_loss": 0.8334130048751831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6072264313697815, + "epoch": 7.67, + "learning_rate": 1.1661031276415893e-05, + "loss": 0.4336, + "step": 9071, + "task_loss": 0.21760205924510956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.536931037902832, + "epoch": 7.67, + "learning_rate": 1.165680473372781e-05, + "loss": 0.346, + "step": 9072, + "task_loss": 0.5139984488487244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3435274660587311, + "epoch": 7.67, + "learning_rate": 1.165257819103973e-05, + "loss": 0.5243, + "step": 9073, + "task_loss": 0.4083363115787506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4037647247314453, + "epoch": 7.67, + "learning_rate": 1.1648351648351648e-05, + "loss": 0.3892, + "step": 9074, + "task_loss": 0.2817915678024292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2859882116317749, + "epoch": 7.67, + "learning_rate": 1.1644125105663568e-05, + "loss": 0.4101, + "step": 9075, + "task_loss": 0.303874671459198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46814122796058655, + "epoch": 7.67, + "learning_rate": 1.1639898562975486e-05, + "loss": 0.53, + "step": 9076, + "task_loss": 0.6683230400085449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4919140338897705, + "epoch": 7.67, + "learning_rate": 1.1635672020287406e-05, + "loss": 0.5234, + "step": 9077, + "task_loss": 0.7466844320297241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3255879878997803, + "epoch": 7.67, + "learning_rate": 1.1631445477599324e-05, + "loss": 0.3179, + "step": 9078, + "task_loss": 0.5697146654129028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44893962144851685, + "epoch": 7.67, + "learning_rate": 1.1627218934911244e-05, + "loss": 0.4497, + "step": 9079, + "task_loss": 0.9178148508071899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28595617413520813, + "epoch": 7.67, + "learning_rate": 1.1622992392223162e-05, + "loss": 0.4128, + "step": 9080, + "task_loss": 0.3437766432762146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29790395498275757, + "epoch": 7.68, + "learning_rate": 1.1618765849535081e-05, + "loss": 0.3494, + "step": 9081, + "task_loss": 0.3793984055519104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35919150710105896, + "epoch": 7.68, + "learning_rate": 1.1614539306847e-05, + "loss": 0.2733, + "step": 9082, + "task_loss": 1.0086805820465088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36165302991867065, + "epoch": 7.68, + "learning_rate": 1.161031276415892e-05, + "loss": 0.3277, + "step": 9083, + "task_loss": 0.5117413997650146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4655190706253052, + "epoch": 7.68, + "learning_rate": 1.1606086221470837e-05, + "loss": 0.382, + "step": 9084, + "task_loss": 0.49258899688720703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35827407240867615, + "epoch": 7.68, + "learning_rate": 1.1601859678782755e-05, + "loss": 0.4059, + "step": 9085, + "task_loss": 0.7047827243804932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5570493340492249, + "epoch": 7.68, + "learning_rate": 1.1597633136094675e-05, + "loss": 0.4489, + "step": 9086, + "task_loss": 0.48567521572113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28169572353363037, + "epoch": 7.68, + "learning_rate": 1.1593406593406595e-05, + "loss": 0.4144, + "step": 9087, + "task_loss": 0.1504969447851181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42657098174095154, + "epoch": 7.68, + "learning_rate": 1.1589180050718513e-05, + "loss": 0.4804, + "step": 9088, + "task_loss": 0.9591243267059326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.585813581943512, + "epoch": 7.68, + "learning_rate": 1.1584953508030431e-05, + "loss": 0.4369, + "step": 9089, + "task_loss": 0.9274604320526123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4261697828769684, + "epoch": 7.68, + "learning_rate": 1.158072696534235e-05, + "loss": 0.4439, + "step": 9090, + "task_loss": 0.80622798204422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4504546821117401, + "epoch": 7.68, + "learning_rate": 1.157650042265427e-05, + "loss": 0.487, + "step": 9091, + "task_loss": 1.0099345445632935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4317294955253601, + "epoch": 7.69, + "learning_rate": 1.1572273879966188e-05, + "loss": 0.3498, + "step": 9092, + "task_loss": 0.5249655842781067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2850191593170166, + "epoch": 7.69, + "learning_rate": 1.1568047337278106e-05, + "loss": 0.4002, + "step": 9093, + "task_loss": 1.681335210800171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48124194145202637, + "epoch": 7.69, + "learning_rate": 1.1563820794590026e-05, + "loss": 0.2992, + "step": 9094, + "task_loss": 0.43444743752479553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38189244270324707, + "epoch": 7.69, + "learning_rate": 1.1559594251901946e-05, + "loss": 0.3871, + "step": 9095, + "task_loss": 0.600250244140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26120293140411377, + "epoch": 7.69, + "learning_rate": 1.1555367709213864e-05, + "loss": 0.3968, + "step": 9096, + "task_loss": 0.2040611058473587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2957070767879486, + "epoch": 7.69, + "learning_rate": 1.1551141166525782e-05, + "loss": 0.449, + "step": 9097, + "task_loss": 0.41480839252471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31503891944885254, + "epoch": 7.69, + "learning_rate": 1.1546914623837702e-05, + "loss": 0.5554, + "step": 9098, + "task_loss": 0.9648346900939941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5616080164909363, + "epoch": 7.69, + "learning_rate": 1.154268808114962e-05, + "loss": 0.4653, + "step": 9099, + "task_loss": 0.39072370529174805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28910255432128906, + "epoch": 7.69, + "learning_rate": 1.153846153846154e-05, + "loss": 0.3073, + "step": 9100, + "task_loss": 0.9958504438400269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3099061846733093, + "epoch": 7.69, + "learning_rate": 1.1534234995773458e-05, + "loss": 0.322, + "step": 9101, + "task_loss": 0.23545800149440765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20131778717041016, + "epoch": 7.69, + "learning_rate": 1.1530008453085377e-05, + "loss": 0.3606, + "step": 9102, + "task_loss": 0.21416786313056946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5189632773399353, + "epoch": 7.69, + "learning_rate": 1.1525781910397295e-05, + "loss": 0.4709, + "step": 9103, + "task_loss": 1.019541621208191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.249685600399971, + "epoch": 7.7, + "learning_rate": 1.1521555367709215e-05, + "loss": 0.3352, + "step": 9104, + "task_loss": 0.8379485011100769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30412718653678894, + "epoch": 7.7, + "learning_rate": 1.1517328825021133e-05, + "loss": 0.2994, + "step": 9105, + "task_loss": 0.2942968010902405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19457846879959106, + "epoch": 7.7, + "learning_rate": 1.1513102282333051e-05, + "loss": 0.3293, + "step": 9106, + "task_loss": 0.2578124701976776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38116198778152466, + "epoch": 7.7, + "learning_rate": 1.1508875739644971e-05, + "loss": 0.3621, + "step": 9107, + "task_loss": 0.43714895844459534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3668285012245178, + "epoch": 7.7, + "learning_rate": 1.150464919695689e-05, + "loss": 0.4831, + "step": 9108, + "task_loss": 0.22167527675628662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4199523329734802, + "epoch": 7.7, + "learning_rate": 1.1500422654268809e-05, + "loss": 0.3866, + "step": 9109, + "task_loss": 0.8118433952331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4506899118423462, + "epoch": 7.7, + "learning_rate": 1.1496196111580727e-05, + "loss": 0.451, + "step": 9110, + "task_loss": 0.9876567125320435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25918519496917725, + "epoch": 7.7, + "learning_rate": 1.1491969568892646e-05, + "loss": 0.3452, + "step": 9111, + "task_loss": 0.4457249343395233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4185684323310852, + "epoch": 7.7, + "learning_rate": 1.1487743026204566e-05, + "loss": 0.3363, + "step": 9112, + "task_loss": 0.31305286288261414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17713600397109985, + "epoch": 7.7, + "learning_rate": 1.1483516483516484e-05, + "loss": 0.2529, + "step": 9113, + "task_loss": 0.6266197562217712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.416693776845932, + "epoch": 7.7, + "learning_rate": 1.1479289940828402e-05, + "loss": 0.469, + "step": 9114, + "task_loss": 0.3884761333465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20577973127365112, + "epoch": 7.7, + "learning_rate": 1.1475063398140322e-05, + "loss": 0.4823, + "step": 9115, + "task_loss": 0.6675399541854858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2792292535305023, + "epoch": 7.71, + "learning_rate": 1.1470836855452242e-05, + "loss": 0.3514, + "step": 9116, + "task_loss": 0.2170674204826355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.644379198551178, + "epoch": 7.71, + "learning_rate": 1.1466610312764158e-05, + "loss": 0.4996, + "step": 9117, + "task_loss": 0.6043576002120972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3493366241455078, + "epoch": 7.71, + "learning_rate": 1.1462383770076078e-05, + "loss": 0.405, + "step": 9118, + "task_loss": 0.6413025856018066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4722885191440582, + "epoch": 7.71, + "learning_rate": 1.1458157227387998e-05, + "loss": 0.4215, + "step": 9119, + "task_loss": 0.4085198640823364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27123552560806274, + "epoch": 7.71, + "learning_rate": 1.1453930684699916e-05, + "loss": 0.3952, + "step": 9120, + "task_loss": 0.18777170777320862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2510784864425659, + "epoch": 7.71, + "learning_rate": 1.1449704142011834e-05, + "loss": 0.369, + "step": 9121, + "task_loss": 0.6821796894073486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28577256202697754, + "epoch": 7.71, + "learning_rate": 1.1445477599323753e-05, + "loss": 0.3817, + "step": 9122, + "task_loss": 0.797143280506134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29055824875831604, + "epoch": 7.71, + "learning_rate": 1.1441251056635673e-05, + "loss": 0.4995, + "step": 9123, + "task_loss": 0.2320108562707901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43018925189971924, + "epoch": 7.71, + "learning_rate": 1.1437024513947591e-05, + "loss": 0.3703, + "step": 9124, + "task_loss": 1.1138101816177368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4061700403690338, + "epoch": 7.71, + "learning_rate": 1.143279797125951e-05, + "loss": 0.4537, + "step": 9125, + "task_loss": 0.40589240193367004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2310669720172882, + "epoch": 7.71, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.3284, + "step": 9126, + "task_loss": 0.3701762855052948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4331786036491394, + "epoch": 7.71, + "learning_rate": 1.1424344885883349e-05, + "loss": 0.3767, + "step": 9127, + "task_loss": 0.25957462191581726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35757148265838623, + "epoch": 7.72, + "learning_rate": 1.1420118343195267e-05, + "loss": 0.4102, + "step": 9128, + "task_loss": 0.4701923429965973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27205705642700195, + "epoch": 7.72, + "learning_rate": 1.1415891800507187e-05, + "loss": 0.3504, + "step": 9129, + "task_loss": 0.8580575585365295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30253949761390686, + "epoch": 7.72, + "learning_rate": 1.1411665257819105e-05, + "loss": 0.3541, + "step": 9130, + "task_loss": 0.3411189913749695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.311810702085495, + "epoch": 7.72, + "learning_rate": 1.1407438715131023e-05, + "loss": 0.4661, + "step": 9131, + "task_loss": 0.34442612528800964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37009477615356445, + "epoch": 7.72, + "learning_rate": 1.1403212172442942e-05, + "loss": 0.3434, + "step": 9132, + "task_loss": 0.49225327372550964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18273411691188812, + "epoch": 7.72, + "learning_rate": 1.1398985629754862e-05, + "loss": 0.4354, + "step": 9133, + "task_loss": 0.06520560383796692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3126942217350006, + "epoch": 7.72, + "learning_rate": 1.139475908706678e-05, + "loss": 0.4715, + "step": 9134, + "task_loss": 0.8652362823486328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23995766043663025, + "epoch": 7.72, + "learning_rate": 1.1390532544378698e-05, + "loss": 0.3971, + "step": 9135, + "task_loss": 0.9217087626457214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3310176432132721, + "epoch": 7.72, + "learning_rate": 1.1386306001690618e-05, + "loss": 0.4764, + "step": 9136, + "task_loss": 0.9857785701751709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29179105162620544, + "epoch": 7.72, + "learning_rate": 1.1382079459002538e-05, + "loss": 0.4123, + "step": 9137, + "task_loss": 0.035466741770505905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33696579933166504, + "epoch": 7.72, + "learning_rate": 1.1377852916314454e-05, + "loss": 0.3377, + "step": 9138, + "task_loss": 0.30561596155166626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3339374363422394, + "epoch": 7.72, + "learning_rate": 1.1373626373626374e-05, + "loss": 0.5173, + "step": 9139, + "task_loss": 0.6568589210510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7359925508499146, + "epoch": 7.73, + "learning_rate": 1.1369399830938294e-05, + "loss": 0.57, + "step": 9140, + "task_loss": 0.9451598525047302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3499267101287842, + "epoch": 7.73, + "learning_rate": 1.1365173288250212e-05, + "loss": 0.3838, + "step": 9141, + "task_loss": 0.5426594018936157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33579370379447937, + "epoch": 7.73, + "learning_rate": 1.136094674556213e-05, + "loss": 0.4124, + "step": 9142, + "task_loss": 0.8392094373703003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36561277508735657, + "epoch": 7.73, + "learning_rate": 1.135672020287405e-05, + "loss": 0.5608, + "step": 9143, + "task_loss": 0.7134408354759216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32397326827049255, + "epoch": 7.73, + "learning_rate": 1.1352493660185969e-05, + "loss": 0.4113, + "step": 9144, + "task_loss": 1.3163927793502808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42200130224227905, + "epoch": 7.73, + "learning_rate": 1.1348267117497887e-05, + "loss": 0.3092, + "step": 9145, + "task_loss": 0.6292231678962708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2413388192653656, + "epoch": 7.73, + "learning_rate": 1.1344040574809805e-05, + "loss": 0.309, + "step": 9146, + "task_loss": 0.23417268693447113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7151644229888916, + "epoch": 7.73, + "learning_rate": 1.1339814032121725e-05, + "loss": 0.3718, + "step": 9147, + "task_loss": 0.6130445599555969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35301852226257324, + "epoch": 7.73, + "learning_rate": 1.1335587489433645e-05, + "loss": 0.4311, + "step": 9148, + "task_loss": 0.8772244453430176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41327613592147827, + "epoch": 7.73, + "learning_rate": 1.1331360946745563e-05, + "loss": 0.33, + "step": 9149, + "task_loss": 0.3825591206550598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2858465313911438, + "epoch": 7.73, + "learning_rate": 1.132713440405748e-05, + "loss": 0.3447, + "step": 9150, + "task_loss": 0.6044866442680359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.360195517539978, + "epoch": 7.73, + "learning_rate": 1.13229078613694e-05, + "loss": 0.4387, + "step": 9151, + "task_loss": 1.3296058177947998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5096611380577087, + "epoch": 7.74, + "learning_rate": 1.1318681318681319e-05, + "loss": 0.3822, + "step": 9152, + "task_loss": 0.41109979152679443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.375434935092926, + "epoch": 7.74, + "learning_rate": 1.1314454775993238e-05, + "loss": 0.471, + "step": 9153, + "task_loss": 0.08674095571041107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28952664136886597, + "epoch": 7.74, + "learning_rate": 1.1310228233305156e-05, + "loss": 0.3473, + "step": 9154, + "task_loss": 0.633596658706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49977564811706543, + "epoch": 7.74, + "learning_rate": 1.1306001690617076e-05, + "loss": 0.4484, + "step": 9155, + "task_loss": 0.6473640203475952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31836628913879395, + "epoch": 7.74, + "learning_rate": 1.1301775147928994e-05, + "loss": 0.3315, + "step": 9156, + "task_loss": 0.5128940939903259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39224445819854736, + "epoch": 7.74, + "learning_rate": 1.1297548605240914e-05, + "loss": 0.4006, + "step": 9157, + "task_loss": 0.6004794239997864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2795179784297943, + "epoch": 7.74, + "learning_rate": 1.1293322062552834e-05, + "loss": 0.3781, + "step": 9158, + "task_loss": 1.411116123199463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30549338459968567, + "epoch": 7.74, + "learning_rate": 1.1289095519864752e-05, + "loss": 0.378, + "step": 9159, + "task_loss": 0.46297594904899597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.527697741985321, + "epoch": 7.74, + "learning_rate": 1.128486897717667e-05, + "loss": 0.4234, + "step": 9160, + "task_loss": 0.6183108687400818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3122134804725647, + "epoch": 7.74, + "learning_rate": 1.128064243448859e-05, + "loss": 0.452, + "step": 9161, + "task_loss": 0.4861637055873871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.453617662191391, + "epoch": 7.74, + "learning_rate": 1.1276415891800509e-05, + "loss": 0.5029, + "step": 9162, + "task_loss": 1.1435117721557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43289196491241455, + "epoch": 7.75, + "learning_rate": 1.1272189349112425e-05, + "loss": 0.4259, + "step": 9163, + "task_loss": 0.2819502651691437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28781914710998535, + "epoch": 7.75, + "learning_rate": 1.1267962806424345e-05, + "loss": 0.408, + "step": 9164, + "task_loss": 0.6944119930267334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2532561719417572, + "epoch": 7.75, + "learning_rate": 1.1263736263736265e-05, + "loss": 0.3105, + "step": 9165, + "task_loss": 0.24177269637584686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1595858931541443, + "epoch": 7.75, + "learning_rate": 1.1259509721048183e-05, + "loss": 0.283, + "step": 9166, + "task_loss": 0.057622428983449936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.602941632270813, + "epoch": 7.75, + "learning_rate": 1.1255283178360101e-05, + "loss": 0.4609, + "step": 9167, + "task_loss": 1.3158483505249023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2732189893722534, + "epoch": 7.75, + "learning_rate": 1.125105663567202e-05, + "loss": 0.3588, + "step": 9168, + "task_loss": 0.4310460388660431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23886744678020477, + "epoch": 7.75, + "learning_rate": 1.124683009298394e-05, + "loss": 0.288, + "step": 9169, + "task_loss": 0.29517704248428345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6346036195755005, + "epoch": 7.75, + "learning_rate": 1.1242603550295859e-05, + "loss": 0.4723, + "step": 9170, + "task_loss": 1.07674241065979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4503568112850189, + "epoch": 7.75, + "learning_rate": 1.1238377007607777e-05, + "loss": 0.4243, + "step": 9171, + "task_loss": 0.843457818031311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49291354417800903, + "epoch": 7.75, + "learning_rate": 1.1234150464919696e-05, + "loss": 0.4708, + "step": 9172, + "task_loss": 0.3602895438671112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4171884059906006, + "epoch": 7.75, + "learning_rate": 1.1229923922231614e-05, + "loss": 0.4086, + "step": 9173, + "task_loss": 0.33604615926742554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3416578769683838, + "epoch": 7.75, + "learning_rate": 1.1225697379543534e-05, + "loss": 0.3545, + "step": 9174, + "task_loss": 0.47816404700279236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3527067303657532, + "epoch": 7.76, + "learning_rate": 1.1221470836855452e-05, + "loss": 0.397, + "step": 9175, + "task_loss": 0.38562795519828796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5763547420501709, + "epoch": 7.76, + "learning_rate": 1.1217244294167372e-05, + "loss": 0.4067, + "step": 9176, + "task_loss": 0.46796807646751404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2539387345314026, + "epoch": 7.76, + "learning_rate": 1.121301775147929e-05, + "loss": 0.4237, + "step": 9177, + "task_loss": 0.447013795375824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41170811653137207, + "epoch": 7.76, + "learning_rate": 1.120879120879121e-05, + "loss": 0.4114, + "step": 9178, + "task_loss": 0.5371567606925964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26679712533950806, + "epoch": 7.76, + "learning_rate": 1.1204564666103128e-05, + "loss": 0.3533, + "step": 9179, + "task_loss": 0.3341583013534546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5373728275299072, + "epoch": 7.76, + "learning_rate": 1.1200338123415047e-05, + "loss": 0.4948, + "step": 9180, + "task_loss": 0.5498508810997009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23268887400627136, + "epoch": 7.76, + "learning_rate": 1.1196111580726966e-05, + "loss": 0.4503, + "step": 9181, + "task_loss": 0.30625489354133606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22331435978412628, + "epoch": 7.76, + "learning_rate": 1.1191885038038885e-05, + "loss": 0.4099, + "step": 9182, + "task_loss": 0.6334330439567566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3860294818878174, + "epoch": 7.76, + "learning_rate": 1.1187658495350803e-05, + "loss": 0.3311, + "step": 9183, + "task_loss": 0.9726356267929077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39235448837280273, + "epoch": 7.76, + "learning_rate": 1.1183431952662721e-05, + "loss": 0.3361, + "step": 9184, + "task_loss": 0.7921847701072693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3648669123649597, + "epoch": 7.76, + "learning_rate": 1.1179205409974641e-05, + "loss": 0.412, + "step": 9185, + "task_loss": 0.4138137698173523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4430561661720276, + "epoch": 7.76, + "learning_rate": 1.117497886728656e-05, + "loss": 0.327, + "step": 9186, + "task_loss": 0.777010440826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3377532362937927, + "epoch": 7.77, + "learning_rate": 1.1170752324598479e-05, + "loss": 0.3774, + "step": 9187, + "task_loss": 0.3540569245815277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2534089684486389, + "epoch": 7.77, + "learning_rate": 1.1166525781910397e-05, + "loss": 0.3343, + "step": 9188, + "task_loss": 0.20269900560379028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2634657621383667, + "epoch": 7.77, + "learning_rate": 1.1162299239222317e-05, + "loss": 0.3443, + "step": 9189, + "task_loss": 0.12720976769924164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2876060903072357, + "epoch": 7.77, + "learning_rate": 1.1158072696534236e-05, + "loss": 0.4041, + "step": 9190, + "task_loss": 0.16513416171073914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6772826313972473, + "epoch": 7.77, + "learning_rate": 1.1153846153846154e-05, + "loss": 0.3405, + "step": 9191, + "task_loss": 0.8949530124664307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29366540908813477, + "epoch": 7.77, + "learning_rate": 1.1149619611158072e-05, + "loss": 0.3496, + "step": 9192, + "task_loss": 0.1253514289855957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38304272294044495, + "epoch": 7.77, + "learning_rate": 1.1145393068469992e-05, + "loss": 0.3894, + "step": 9193, + "task_loss": 0.4890056848526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5221858620643616, + "epoch": 7.77, + "learning_rate": 1.1141166525781912e-05, + "loss": 0.4178, + "step": 9194, + "task_loss": 0.8837781548500061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1953422725200653, + "epoch": 7.77, + "learning_rate": 1.113693998309383e-05, + "loss": 0.4303, + "step": 9195, + "task_loss": 0.48021215200424194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20505984127521515, + "epoch": 7.77, + "learning_rate": 1.1132713440405748e-05, + "loss": 0.3264, + "step": 9196, + "task_loss": 0.8538981080055237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3823113739490509, + "epoch": 7.77, + "learning_rate": 1.1128486897717668e-05, + "loss": 0.4258, + "step": 9197, + "task_loss": 0.9584882259368896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.436233788728714, + "epoch": 7.77, + "learning_rate": 1.1124260355029586e-05, + "loss": 0.349, + "step": 9198, + "task_loss": 0.11669134348630905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27526962757110596, + "epoch": 7.78, + "learning_rate": 1.1120033812341506e-05, + "loss": 0.3592, + "step": 9199, + "task_loss": 0.2844611406326294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3739403784275055, + "epoch": 7.78, + "learning_rate": 1.1115807269653424e-05, + "loss": 0.4449, + "step": 9200, + "task_loss": 1.0262497663497925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33324164152145386, + "epoch": 7.78, + "learning_rate": 1.1111580726965343e-05, + "loss": 0.4403, + "step": 9201, + "task_loss": 0.46012914180755615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2956051826477051, + "epoch": 7.78, + "learning_rate": 1.1107354184277261e-05, + "loss": 0.3194, + "step": 9202, + "task_loss": 0.8431546092033386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42520707845687866, + "epoch": 7.78, + "learning_rate": 1.1103127641589181e-05, + "loss": 0.3514, + "step": 9203, + "task_loss": 1.8556407690048218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3355869948863983, + "epoch": 7.78, + "learning_rate": 1.10989010989011e-05, + "loss": 0.3233, + "step": 9204, + "task_loss": 0.34708765149116516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37278473377227783, + "epoch": 7.78, + "learning_rate": 1.1094674556213017e-05, + "loss": 0.4773, + "step": 9205, + "task_loss": 0.42556461691856384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5236413478851318, + "epoch": 7.78, + "learning_rate": 1.1090448013524937e-05, + "loss": 0.4396, + "step": 9206, + "task_loss": 0.5779834389686584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3706008195877075, + "epoch": 7.78, + "learning_rate": 1.1086221470836857e-05, + "loss": 0.4711, + "step": 9207, + "task_loss": 0.6224589347839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3697749972343445, + "epoch": 7.78, + "learning_rate": 1.1081994928148775e-05, + "loss": 0.367, + "step": 9208, + "task_loss": 0.3714015781879425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38036292791366577, + "epoch": 7.78, + "learning_rate": 1.1077768385460693e-05, + "loss": 0.4015, + "step": 9209, + "task_loss": 0.43919825553894043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44260141253471375, + "epoch": 7.78, + "learning_rate": 1.1073541842772613e-05, + "loss": 0.4259, + "step": 9210, + "task_loss": 0.6497274041175842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47996985912323, + "epoch": 7.79, + "learning_rate": 1.1069315300084532e-05, + "loss": 0.4164, + "step": 9211, + "task_loss": 0.3929479122161865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6918481588363647, + "epoch": 7.79, + "learning_rate": 1.106508875739645e-05, + "loss": 0.5524, + "step": 9212, + "task_loss": 0.9093455076217651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30319127440452576, + "epoch": 7.79, + "learning_rate": 1.1060862214708368e-05, + "loss": 0.373, + "step": 9213, + "task_loss": 0.055270709097385406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29538580775260925, + "epoch": 7.79, + "learning_rate": 1.1056635672020288e-05, + "loss": 0.554, + "step": 9214, + "task_loss": 0.2552472949028015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3128582835197449, + "epoch": 7.79, + "learning_rate": 1.1052409129332208e-05, + "loss": 0.352, + "step": 9215, + "task_loss": 0.24531985819339752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5729312896728516, + "epoch": 7.79, + "learning_rate": 1.1048182586644126e-05, + "loss": 0.3462, + "step": 9216, + "task_loss": 0.6595221757888794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2506973147392273, + "epoch": 7.79, + "learning_rate": 1.1043956043956044e-05, + "loss": 0.3554, + "step": 9217, + "task_loss": 0.49035191535949707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.605094850063324, + "epoch": 7.79, + "learning_rate": 1.1039729501267964e-05, + "loss": 0.4557, + "step": 9218, + "task_loss": 0.6452468633651733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5192772746086121, + "epoch": 7.79, + "learning_rate": 1.1035502958579882e-05, + "loss": 0.3782, + "step": 9219, + "task_loss": 0.08161477744579315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1927189826965332, + "epoch": 7.79, + "learning_rate": 1.1031276415891801e-05, + "loss": 0.2816, + "step": 9220, + "task_loss": 0.5210713744163513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3268837332725525, + "epoch": 7.79, + "learning_rate": 1.102704987320372e-05, + "loss": 0.437, + "step": 9221, + "task_loss": 0.2795208692550659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38489678502082825, + "epoch": 7.79, + "learning_rate": 1.102282333051564e-05, + "loss": 0.3736, + "step": 9222, + "task_loss": 1.000780463218689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3962850570678711, + "epoch": 7.8, + "learning_rate": 1.1018596787827557e-05, + "loss": 0.3243, + "step": 9223, + "task_loss": 0.8263521194458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19328072667121887, + "epoch": 7.8, + "learning_rate": 1.1014370245139477e-05, + "loss": 0.3702, + "step": 9224, + "task_loss": 0.7977550029754639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.782816469669342, + "epoch": 7.8, + "learning_rate": 1.1010143702451395e-05, + "loss": 0.4765, + "step": 9225, + "task_loss": 0.9350370168685913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6613709926605225, + "epoch": 7.8, + "learning_rate": 1.1005917159763315e-05, + "loss": 0.527, + "step": 9226, + "task_loss": 1.4070037603378296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4617674946784973, + "epoch": 7.8, + "learning_rate": 1.1001690617075233e-05, + "loss": 0.4533, + "step": 9227, + "task_loss": 1.694656491279602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3281554877758026, + "epoch": 7.8, + "learning_rate": 1.0997464074387153e-05, + "loss": 0.4368, + "step": 9228, + "task_loss": 0.40840476751327515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4342308044433594, + "epoch": 7.8, + "learning_rate": 1.099323753169907e-05, + "loss": 0.3637, + "step": 9229, + "task_loss": 0.6556279063224792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36412039399147034, + "epoch": 7.8, + "learning_rate": 1.0989010989010989e-05, + "loss": 0.4814, + "step": 9230, + "task_loss": 0.44055911898612976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4204012453556061, + "epoch": 7.8, + "learning_rate": 1.0984784446322908e-05, + "loss": 0.3378, + "step": 9231, + "task_loss": 0.2857893705368042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5335394144058228, + "epoch": 7.8, + "learning_rate": 1.0980557903634828e-05, + "loss": 0.4346, + "step": 9232, + "task_loss": 0.5185115337371826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3753792643547058, + "epoch": 7.8, + "learning_rate": 1.0976331360946746e-05, + "loss": 0.3553, + "step": 9233, + "task_loss": 0.9264285564422607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20593991875648499, + "epoch": 7.81, + "learning_rate": 1.0972104818258664e-05, + "loss": 0.3257, + "step": 9234, + "task_loss": 0.3912558853626251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2786403000354767, + "epoch": 7.81, + "learning_rate": 1.0967878275570584e-05, + "loss": 0.4595, + "step": 9235, + "task_loss": 0.7157214283943176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4143175482749939, + "epoch": 7.81, + "learning_rate": 1.0963651732882504e-05, + "loss": 0.4229, + "step": 9236, + "task_loss": 0.24216699600219727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.50058913230896, + "epoch": 7.81, + "learning_rate": 1.095942519019442e-05, + "loss": 0.3806, + "step": 9237, + "task_loss": 1.2244868278503418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3064241111278534, + "epoch": 7.81, + "learning_rate": 1.095519864750634e-05, + "loss": 0.3839, + "step": 9238, + "task_loss": 0.25598084926605225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4368259012699127, + "epoch": 7.81, + "learning_rate": 1.095097210481826e-05, + "loss": 0.2959, + "step": 9239, + "task_loss": 0.39154309034347534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.10724813491106033, + "epoch": 7.81, + "learning_rate": 1.094674556213018e-05, + "loss": 0.4351, + "step": 9240, + "task_loss": 0.007359111215919256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23296202719211578, + "epoch": 7.81, + "learning_rate": 1.0942519019442096e-05, + "loss": 0.4339, + "step": 9241, + "task_loss": 0.09631559997797012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43050095438957214, + "epoch": 7.81, + "learning_rate": 1.0938292476754015e-05, + "loss": 0.3819, + "step": 9242, + "task_loss": 0.5619154572486877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24690952897071838, + "epoch": 7.81, + "learning_rate": 1.0934065934065935e-05, + "loss": 0.3671, + "step": 9243, + "task_loss": 0.3900529146194458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3097602427005768, + "epoch": 7.81, + "learning_rate": 1.0929839391377853e-05, + "loss": 0.3543, + "step": 9244, + "task_loss": 0.7247515320777893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6165492534637451, + "epoch": 7.81, + "learning_rate": 1.0925612848689773e-05, + "loss": 0.5918, + "step": 9245, + "task_loss": 0.8551908731460571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2913127839565277, + "epoch": 7.82, + "learning_rate": 1.0921386306001691e-05, + "loss": 0.4532, + "step": 9246, + "task_loss": 0.057152606546878815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.343362957239151, + "epoch": 7.82, + "learning_rate": 1.091715976331361e-05, + "loss": 0.392, + "step": 9247, + "task_loss": 1.0392510890960693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3892924189567566, + "epoch": 7.82, + "learning_rate": 1.0912933220625529e-05, + "loss": 0.4225, + "step": 9248, + "task_loss": 0.7380716800689697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5509169101715088, + "epoch": 7.82, + "learning_rate": 1.0908706677937448e-05, + "loss": 0.4196, + "step": 9249, + "task_loss": 1.1596014499664307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3979646563529968, + "epoch": 7.82, + "learning_rate": 1.0904480135249366e-05, + "loss": 0.4497, + "step": 9250, + "task_loss": 0.6002991795539856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25178012251853943, + "epoch": 7.82, + "learning_rate": 1.0900253592561285e-05, + "loss": 0.5752, + "step": 9251, + "task_loss": 0.5575987100601196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27283474802970886, + "epoch": 7.82, + "learning_rate": 1.0896027049873204e-05, + "loss": 0.3042, + "step": 9252, + "task_loss": 0.8232729434967041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33235543966293335, + "epoch": 7.82, + "learning_rate": 1.0891800507185124e-05, + "loss": 0.3792, + "step": 9253, + "task_loss": 0.41671988368034363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27926045656204224, + "epoch": 7.82, + "learning_rate": 1.0887573964497042e-05, + "loss": 0.3499, + "step": 9254, + "task_loss": 0.3936062157154083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35859912633895874, + "epoch": 7.82, + "learning_rate": 1.088334742180896e-05, + "loss": 0.2846, + "step": 9255, + "task_loss": 0.7740437388420105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17286600172519684, + "epoch": 7.82, + "learning_rate": 1.087912087912088e-05, + "loss": 0.2555, + "step": 9256, + "task_loss": 0.18293415009975433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36977535486221313, + "epoch": 7.82, + "learning_rate": 1.08748943364328e-05, + "loss": 0.3573, + "step": 9257, + "task_loss": 0.9208285808563232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.326702356338501, + "epoch": 7.83, + "learning_rate": 1.0870667793744718e-05, + "loss": 0.3987, + "step": 9258, + "task_loss": 0.23134936392307281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5045897364616394, + "epoch": 7.83, + "learning_rate": 1.0866441251056636e-05, + "loss": 0.462, + "step": 9259, + "task_loss": 0.6422290205955505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4039265513420105, + "epoch": 7.83, + "learning_rate": 1.0862214708368555e-05, + "loss": 0.4371, + "step": 9260, + "task_loss": 0.5250632166862488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5410283803939819, + "epoch": 7.83, + "learning_rate": 1.0857988165680475e-05, + "loss": 0.3873, + "step": 9261, + "task_loss": 0.8205486536026001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3859993517398834, + "epoch": 7.83, + "learning_rate": 1.0853761622992391e-05, + "loss": 0.478, + "step": 9262, + "task_loss": 0.8574113845825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4508819282054901, + "epoch": 7.83, + "learning_rate": 1.0849535080304311e-05, + "loss": 0.3396, + "step": 9263, + "task_loss": 0.7825176119804382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3347051441669464, + "epoch": 7.83, + "learning_rate": 1.0845308537616231e-05, + "loss": 0.2939, + "step": 9264, + "task_loss": 0.7098431587219238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23411542177200317, + "epoch": 7.83, + "learning_rate": 1.0841081994928149e-05, + "loss": 0.4148, + "step": 9265, + "task_loss": 0.46298152208328247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4776533544063568, + "epoch": 7.83, + "learning_rate": 1.0836855452240067e-05, + "loss": 0.4466, + "step": 9266, + "task_loss": 0.21664302051067352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3806421756744385, + "epoch": 7.83, + "learning_rate": 1.0832628909551987e-05, + "loss": 0.3549, + "step": 9267, + "task_loss": 0.9924259781837463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3687785267829895, + "epoch": 7.83, + "learning_rate": 1.0828402366863907e-05, + "loss": 0.3995, + "step": 9268, + "task_loss": 1.108720064163208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24215102195739746, + "epoch": 7.83, + "learning_rate": 1.0824175824175825e-05, + "loss": 0.2925, + "step": 9269, + "task_loss": 0.16937589645385742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45767706632614136, + "epoch": 7.84, + "learning_rate": 1.0819949281487743e-05, + "loss": 0.4664, + "step": 9270, + "task_loss": 0.51558917760849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3040665090084076, + "epoch": 7.84, + "learning_rate": 1.0815722738799662e-05, + "loss": 0.381, + "step": 9271, + "task_loss": 0.621455192565918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6382224559783936, + "epoch": 7.84, + "learning_rate": 1.0811496196111582e-05, + "loss": 0.5193, + "step": 9272, + "task_loss": 1.1030546426773071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.436740905046463, + "epoch": 7.84, + "learning_rate": 1.08072696534235e-05, + "loss": 0.4669, + "step": 9273, + "task_loss": 0.46199971437454224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2162485420703888, + "epoch": 7.84, + "learning_rate": 1.080304311073542e-05, + "loss": 0.2986, + "step": 9274, + "task_loss": 0.7285430431365967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7007200717926025, + "epoch": 7.84, + "learning_rate": 1.0798816568047338e-05, + "loss": 0.5333, + "step": 9275, + "task_loss": 1.3763097524642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33861714601516724, + "epoch": 7.84, + "learning_rate": 1.0794590025359256e-05, + "loss": 0.3324, + "step": 9276, + "task_loss": 0.49422213435173035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41579142212867737, + "epoch": 7.84, + "learning_rate": 1.0790363482671176e-05, + "loss": 0.3838, + "step": 9277, + "task_loss": 0.4389484226703644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39072635769844055, + "epoch": 7.84, + "learning_rate": 1.0786136939983095e-05, + "loss": 0.3572, + "step": 9278, + "task_loss": 0.9798759818077087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4454267621040344, + "epoch": 7.84, + "learning_rate": 1.0781910397295014e-05, + "loss": 0.4148, + "step": 9279, + "task_loss": 0.9816865921020508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3471112847328186, + "epoch": 7.84, + "learning_rate": 1.0777683854606932e-05, + "loss": 0.3311, + "step": 9280, + "task_loss": 0.7161145210266113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3434704840183258, + "epoch": 7.84, + "learning_rate": 1.0773457311918851e-05, + "loss": 0.2772, + "step": 9281, + "task_loss": 0.3031379282474518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2679635286331177, + "epoch": 7.85, + "learning_rate": 1.0769230769230771e-05, + "loss": 0.3875, + "step": 9282, + "task_loss": 0.33075010776519775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3778381943702698, + "epoch": 7.85, + "learning_rate": 1.0765004226542687e-05, + "loss": 0.2594, + "step": 9283, + "task_loss": 0.24884802103042603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6224073767662048, + "epoch": 7.85, + "learning_rate": 1.0760777683854607e-05, + "loss": 0.412, + "step": 9284, + "task_loss": 1.0142754316329956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3548678457736969, + "epoch": 7.85, + "learning_rate": 1.0756551141166527e-05, + "loss": 0.4708, + "step": 9285, + "task_loss": 0.3485974371433258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2104184627532959, + "epoch": 7.85, + "learning_rate": 1.0752324598478445e-05, + "loss": 0.3173, + "step": 9286, + "task_loss": 0.19081072509288788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3008164167404175, + "epoch": 7.85, + "learning_rate": 1.0748098055790363e-05, + "loss": 0.3849, + "step": 9287, + "task_loss": 0.10594374686479568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39720824360847473, + "epoch": 7.85, + "learning_rate": 1.0743871513102283e-05, + "loss": 0.3885, + "step": 9288, + "task_loss": 0.7695749402046204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35902607440948486, + "epoch": 7.85, + "learning_rate": 1.0739644970414202e-05, + "loss": 0.3734, + "step": 9289, + "task_loss": 0.2672860622406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3068811297416687, + "epoch": 7.85, + "learning_rate": 1.073541842772612e-05, + "loss": 0.4346, + "step": 9290, + "task_loss": 0.7562078237533569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22187155485153198, + "epoch": 7.85, + "learning_rate": 1.0731191885038039e-05, + "loss": 0.2723, + "step": 9291, + "task_loss": 0.38457921147346497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33657142519950867, + "epoch": 7.85, + "learning_rate": 1.0726965342349958e-05, + "loss": 0.2996, + "step": 9292, + "task_loss": 0.9042322635650635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5300925970077515, + "epoch": 7.85, + "learning_rate": 1.0722738799661878e-05, + "loss": 0.4183, + "step": 9293, + "task_loss": 0.6678743958473206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28959164023399353, + "epoch": 7.86, + "learning_rate": 1.0718512256973796e-05, + "loss": 0.3758, + "step": 9294, + "task_loss": 0.8730827569961548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3192163109779358, + "epoch": 7.86, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.3226, + "step": 9295, + "task_loss": 0.7750691771507263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24617919325828552, + "epoch": 7.86, + "learning_rate": 1.0710059171597634e-05, + "loss": 0.3409, + "step": 9296, + "task_loss": 0.3111129701137543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5864434242248535, + "epoch": 7.86, + "learning_rate": 1.0705832628909552e-05, + "loss": 0.4895, + "step": 9297, + "task_loss": 0.3661956191062927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41226378083229065, + "epoch": 7.86, + "learning_rate": 1.0701606086221472e-05, + "loss": 0.3716, + "step": 9298, + "task_loss": 1.0340471267700195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5685677528381348, + "epoch": 7.86, + "learning_rate": 1.069737954353339e-05, + "loss": 0.4621, + "step": 9299, + "task_loss": 0.9203925132751465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.390229195356369, + "epoch": 7.86, + "learning_rate": 1.069315300084531e-05, + "loss": 0.3804, + "step": 9300, + "task_loss": 0.5961461663246155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4006960391998291, + "epoch": 7.86, + "learning_rate": 1.0688926458157227e-05, + "loss": 0.3864, + "step": 9301, + "task_loss": 1.086032748222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4856417179107666, + "epoch": 7.86, + "learning_rate": 1.0684699915469147e-05, + "loss": 0.3861, + "step": 9302, + "task_loss": 0.8427696824073792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4628952741622925, + "epoch": 7.86, + "learning_rate": 1.0680473372781065e-05, + "loss": 0.3843, + "step": 9303, + "task_loss": 0.6110355257987976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27146050333976746, + "epoch": 7.86, + "learning_rate": 1.0676246830092985e-05, + "loss": 0.4831, + "step": 9304, + "task_loss": 0.5670109987258911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4627695083618164, + "epoch": 7.87, + "learning_rate": 1.0672020287404903e-05, + "loss": 0.4282, + "step": 9305, + "task_loss": 0.26529747247695923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2669185698032379, + "epoch": 7.87, + "learning_rate": 1.0667793744716823e-05, + "loss": 0.4186, + "step": 9306, + "task_loss": 0.7672763466835022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2834002375602722, + "epoch": 7.87, + "learning_rate": 1.0663567202028742e-05, + "loss": 0.3756, + "step": 9307, + "task_loss": 0.3437587320804596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5161275863647461, + "epoch": 7.87, + "learning_rate": 1.0659340659340659e-05, + "loss": 0.3767, + "step": 9308, + "task_loss": 0.9973774552345276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34854525327682495, + "epoch": 7.87, + "learning_rate": 1.0655114116652579e-05, + "loss": 0.4634, + "step": 9309, + "task_loss": 0.3313879668712616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2869054973125458, + "epoch": 7.87, + "learning_rate": 1.0650887573964498e-05, + "loss": 0.449, + "step": 9310, + "task_loss": 0.50322026014328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39097028970718384, + "epoch": 7.87, + "learning_rate": 1.0646661031276416e-05, + "loss": 0.4056, + "step": 9311, + "task_loss": 0.6332195401191711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46916520595550537, + "epoch": 7.87, + "learning_rate": 1.0642434488588334e-05, + "loss": 0.564, + "step": 9312, + "task_loss": 0.6158884167671204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21863512694835663, + "epoch": 7.87, + "learning_rate": 1.0638207945900254e-05, + "loss": 0.3167, + "step": 9313, + "task_loss": 0.37384873628616333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25401031970977783, + "epoch": 7.87, + "learning_rate": 1.0633981403212174e-05, + "loss": 0.4379, + "step": 9314, + "task_loss": 0.6856475472450256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43018198013305664, + "epoch": 7.87, + "learning_rate": 1.0629754860524092e-05, + "loss": 0.458, + "step": 9315, + "task_loss": 0.9519250988960266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28802549839019775, + "epoch": 7.87, + "learning_rate": 1.062552831783601e-05, + "loss": 0.4003, + "step": 9316, + "task_loss": 0.8300285935401917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35494333505630493, + "epoch": 7.88, + "learning_rate": 1.062130177514793e-05, + "loss": 0.4602, + "step": 9317, + "task_loss": 0.5488393306732178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28769451379776, + "epoch": 7.88, + "learning_rate": 1.0617075232459848e-05, + "loss": 0.3754, + "step": 9318, + "task_loss": 0.5718137621879578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26615551114082336, + "epoch": 7.88, + "learning_rate": 1.0612848689771767e-05, + "loss": 0.3624, + "step": 9319, + "task_loss": 0.8932967782020569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28873610496520996, + "epoch": 7.88, + "learning_rate": 1.0608622147083686e-05, + "loss": 0.3625, + "step": 9320, + "task_loss": 0.8059813380241394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3873116970062256, + "epoch": 7.88, + "learning_rate": 1.0604395604395605e-05, + "loss": 0.4202, + "step": 9321, + "task_loss": 0.3786969482898712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5934972167015076, + "epoch": 7.88, + "learning_rate": 1.0600169061707523e-05, + "loss": 0.4451, + "step": 9322, + "task_loss": 0.5832181572914124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24798718094825745, + "epoch": 7.88, + "learning_rate": 1.0595942519019443e-05, + "loss": 0.3366, + "step": 9323, + "task_loss": 0.44672125577926636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30515140295028687, + "epoch": 7.88, + "learning_rate": 1.0591715976331361e-05, + "loss": 0.4542, + "step": 9324, + "task_loss": 1.3762881755828857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3264380097389221, + "epoch": 7.88, + "learning_rate": 1.058748943364328e-05, + "loss": 0.3329, + "step": 9325, + "task_loss": 0.14306579530239105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29918310046195984, + "epoch": 7.88, + "learning_rate": 1.0583262890955199e-05, + "loss": 0.3892, + "step": 9326, + "task_loss": 0.3864036798477173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3965027928352356, + "epoch": 7.88, + "learning_rate": 1.0579036348267119e-05, + "loss": 0.4572, + "step": 9327, + "task_loss": 0.41052335500717163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.393541157245636, + "epoch": 7.88, + "learning_rate": 1.0574809805579037e-05, + "loss": 0.3673, + "step": 9328, + "task_loss": 0.9022150039672852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20806947350502014, + "epoch": 7.89, + "learning_rate": 1.0570583262890955e-05, + "loss": 0.3604, + "step": 9329, + "task_loss": 0.6552309989929199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21843647956848145, + "epoch": 7.89, + "learning_rate": 1.0566356720202874e-05, + "loss": 0.3247, + "step": 9330, + "task_loss": 0.7550066709518433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3441922068595886, + "epoch": 7.89, + "learning_rate": 1.0562130177514794e-05, + "loss": 0.3912, + "step": 9331, + "task_loss": 0.32267022132873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2799738049507141, + "epoch": 7.89, + "learning_rate": 1.0557903634826712e-05, + "loss": 0.3771, + "step": 9332, + "task_loss": 0.6509575247764587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48090341687202454, + "epoch": 7.89, + "learning_rate": 1.055367709213863e-05, + "loss": 0.38, + "step": 9333, + "task_loss": 1.0719225406646729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.345784068107605, + "epoch": 7.89, + "learning_rate": 1.054945054945055e-05, + "loss": 0.3091, + "step": 9334, + "task_loss": 0.705093502998352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31461939215660095, + "epoch": 7.89, + "learning_rate": 1.054522400676247e-05, + "loss": 0.4917, + "step": 9335, + "task_loss": 0.3675152063369751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2938218116760254, + "epoch": 7.89, + "learning_rate": 1.0540997464074388e-05, + "loss": 0.4663, + "step": 9336, + "task_loss": 1.1813033819198608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3476431667804718, + "epoch": 7.89, + "learning_rate": 1.0536770921386306e-05, + "loss": 0.289, + "step": 9337, + "task_loss": 0.6157872080802917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.668760359287262, + "epoch": 7.89, + "learning_rate": 1.0532544378698226e-05, + "loss": 0.5169, + "step": 9338, + "task_loss": 1.2601101398468018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36466726660728455, + "epoch": 7.89, + "learning_rate": 1.0528317836010145e-05, + "loss": 0.383, + "step": 9339, + "task_loss": 1.520774245262146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49753037095069885, + "epoch": 7.89, + "learning_rate": 1.0524091293322063e-05, + "loss": 0.3962, + "step": 9340, + "task_loss": 1.120042085647583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40975221991539, + "epoch": 7.9, + "learning_rate": 1.0519864750633981e-05, + "loss": 0.384, + "step": 9341, + "task_loss": 0.46020108461380005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22798791527748108, + "epoch": 7.9, + "learning_rate": 1.0515638207945901e-05, + "loss": 0.4763, + "step": 9342, + "task_loss": 0.4292354881763458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27462202310562134, + "epoch": 7.9, + "learning_rate": 1.051141166525782e-05, + "loss": 0.3773, + "step": 9343, + "task_loss": 0.36112216114997864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3133104741573334, + "epoch": 7.9, + "learning_rate": 1.0507185122569739e-05, + "loss": 0.4259, + "step": 9344, + "task_loss": 0.443113774061203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3287684917449951, + "epoch": 7.9, + "learning_rate": 1.0502958579881657e-05, + "loss": 0.3504, + "step": 9345, + "task_loss": 0.07360823452472687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29685765504837036, + "epoch": 7.9, + "learning_rate": 1.0498732037193577e-05, + "loss": 0.4172, + "step": 9346, + "task_loss": 0.8013165593147278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.306978315114975, + "epoch": 7.9, + "learning_rate": 1.0494505494505495e-05, + "loss": 0.3974, + "step": 9347, + "task_loss": 0.6382566094398499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3207901120185852, + "epoch": 7.9, + "learning_rate": 1.0490278951817414e-05, + "loss": 0.3409, + "step": 9348, + "task_loss": 1.107635259628296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19824336469173431, + "epoch": 7.9, + "learning_rate": 1.0486052409129333e-05, + "loss": 0.2463, + "step": 9349, + "task_loss": 0.5026869773864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19186586141586304, + "epoch": 7.9, + "learning_rate": 1.048182586644125e-05, + "loss": 0.3904, + "step": 9350, + "task_loss": 0.43056154251098633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3297847509384155, + "epoch": 7.9, + "learning_rate": 1.047759932375317e-05, + "loss": 0.3672, + "step": 9351, + "task_loss": 0.3292813301086426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3019562363624573, + "epoch": 7.9, + "learning_rate": 1.047337278106509e-05, + "loss": 0.4057, + "step": 9352, + "task_loss": 0.2962067723274231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33273234963417053, + "epoch": 7.91, + "learning_rate": 1.0469146238377008e-05, + "loss": 0.3189, + "step": 9353, + "task_loss": 0.3907542824745178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2746749222278595, + "epoch": 7.91, + "learning_rate": 1.0464919695688926e-05, + "loss": 0.5712, + "step": 9354, + "task_loss": 0.7671428918838501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3933637738227844, + "epoch": 7.91, + "learning_rate": 1.0460693153000846e-05, + "loss": 0.4262, + "step": 9355, + "task_loss": 0.5202326774597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3578820526599884, + "epoch": 7.91, + "learning_rate": 1.0456466610312766e-05, + "loss": 0.3935, + "step": 9356, + "task_loss": 0.7592084407806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5406107902526855, + "epoch": 7.91, + "learning_rate": 1.0452240067624684e-05, + "loss": 0.4077, + "step": 9357, + "task_loss": 0.5751054286956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2250983715057373, + "epoch": 7.91, + "learning_rate": 1.0448013524936602e-05, + "loss": 0.3505, + "step": 9358, + "task_loss": 0.14698077738285065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5378384590148926, + "epoch": 7.91, + "learning_rate": 1.0443786982248521e-05, + "loss": 0.4738, + "step": 9359, + "task_loss": 0.18897125124931335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4620664119720459, + "epoch": 7.91, + "learning_rate": 1.0439560439560441e-05, + "loss": 0.4251, + "step": 9360, + "task_loss": 0.6229583621025085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4064135253429413, + "epoch": 7.91, + "learning_rate": 1.0435333896872358e-05, + "loss": 0.3698, + "step": 9361, + "task_loss": 0.8910341858863831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1648712158203125, + "epoch": 7.91, + "learning_rate": 1.0431107354184277e-05, + "loss": 0.3465, + "step": 9362, + "task_loss": 0.6734543442726135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19563442468643188, + "epoch": 7.91, + "learning_rate": 1.0426880811496197e-05, + "loss": 0.2921, + "step": 9363, + "task_loss": 0.30024436116218567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3703368008136749, + "epoch": 7.91, + "learning_rate": 1.0422654268808115e-05, + "loss": 0.3696, + "step": 9364, + "task_loss": 0.2711583077907562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3399535119533539, + "epoch": 7.92, + "learning_rate": 1.0418427726120035e-05, + "loss": 0.2639, + "step": 9365, + "task_loss": 0.30582135915756226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35873204469680786, + "epoch": 7.92, + "learning_rate": 1.0414201183431953e-05, + "loss": 0.3214, + "step": 9366, + "task_loss": 0.668890118598938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3131570518016815, + "epoch": 7.92, + "learning_rate": 1.0409974640743873e-05, + "loss": 0.2867, + "step": 9367, + "task_loss": 0.2604713439941406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3699115812778473, + "epoch": 7.92, + "learning_rate": 1.040574809805579e-05, + "loss": 0.3903, + "step": 9368, + "task_loss": 0.6998211741447449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33093830943107605, + "epoch": 7.92, + "learning_rate": 1.040152155536771e-05, + "loss": 0.3774, + "step": 9369, + "task_loss": 0.6415210962295532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.237060546875, + "epoch": 7.92, + "learning_rate": 1.0397295012679628e-05, + "loss": 0.3896, + "step": 9370, + "task_loss": 0.546448290348053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4397224187850952, + "epoch": 7.92, + "learning_rate": 1.0393068469991548e-05, + "loss": 0.3842, + "step": 9371, + "task_loss": 0.7050089836120605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26140260696411133, + "epoch": 7.92, + "learning_rate": 1.0388841927303466e-05, + "loss": 0.329, + "step": 9372, + "task_loss": 0.23296354711055756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41678908467292786, + "epoch": 7.92, + "learning_rate": 1.0384615384615386e-05, + "loss": 0.3816, + "step": 9373, + "task_loss": 1.048256516456604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39260610938072205, + "epoch": 7.92, + "learning_rate": 1.0380388841927304e-05, + "loss": 0.3812, + "step": 9374, + "task_loss": 1.2521941661834717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4431672990322113, + "epoch": 7.92, + "learning_rate": 1.0376162299239222e-05, + "loss": 0.4888, + "step": 9375, + "task_loss": 0.5465877652168274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.56330406665802, + "epoch": 7.93, + "learning_rate": 1.0371935756551142e-05, + "loss": 0.4941, + "step": 9376, + "task_loss": 1.1811200380325317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3872709274291992, + "epoch": 7.93, + "learning_rate": 1.0367709213863061e-05, + "loss": 0.5543, + "step": 9377, + "task_loss": 0.3980772793292999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42802882194519043, + "epoch": 7.93, + "learning_rate": 1.036348267117498e-05, + "loss": 0.3673, + "step": 9378, + "task_loss": 0.4631832242012024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27570968866348267, + "epoch": 7.93, + "learning_rate": 1.0359256128486898e-05, + "loss": 0.3618, + "step": 9379, + "task_loss": 1.0667927265167236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2934645414352417, + "epoch": 7.93, + "learning_rate": 1.0355029585798817e-05, + "loss": 0.3589, + "step": 9380, + "task_loss": 1.0715514421463013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46468454599380493, + "epoch": 7.93, + "learning_rate": 1.0350803043110737e-05, + "loss": 0.3422, + "step": 9381, + "task_loss": 0.45326223969459534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36552029848098755, + "epoch": 7.93, + "learning_rate": 1.0346576500422653e-05, + "loss": 0.3657, + "step": 9382, + "task_loss": 0.6220387816429138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32779645919799805, + "epoch": 7.93, + "learning_rate": 1.0342349957734573e-05, + "loss": 0.3138, + "step": 9383, + "task_loss": 0.24285943806171417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3008289933204651, + "epoch": 7.93, + "learning_rate": 1.0338123415046493e-05, + "loss": 0.3984, + "step": 9384, + "task_loss": 0.6523308157920837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2053653746843338, + "epoch": 7.93, + "learning_rate": 1.0333896872358411e-05, + "loss": 0.3291, + "step": 9385, + "task_loss": 0.6391233801841736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2764323949813843, + "epoch": 7.93, + "learning_rate": 1.0329670329670329e-05, + "loss": 0.3431, + "step": 9386, + "task_loss": 0.8804179430007935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.58913254737854, + "epoch": 7.93, + "learning_rate": 1.0325443786982249e-05, + "loss": 0.3834, + "step": 9387, + "task_loss": 0.23427382111549377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6203899383544922, + "epoch": 7.94, + "learning_rate": 1.0321217244294168e-05, + "loss": 0.4805, + "step": 9388, + "task_loss": 0.8846639394760132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.353671133518219, + "epoch": 7.94, + "learning_rate": 1.0316990701606086e-05, + "loss": 0.3084, + "step": 9389, + "task_loss": 0.30331912636756897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25826042890548706, + "epoch": 7.94, + "learning_rate": 1.0312764158918005e-05, + "loss": 0.3345, + "step": 9390, + "task_loss": 0.4506385922431946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40124833583831787, + "epoch": 7.94, + "learning_rate": 1.0308537616229924e-05, + "loss": 0.3399, + "step": 9391, + "task_loss": 0.46779727935791016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.183677077293396, + "epoch": 7.94, + "learning_rate": 1.0304311073541844e-05, + "loss": 0.3317, + "step": 9392, + "task_loss": 0.4071427881717682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46816083788871765, + "epoch": 7.94, + "learning_rate": 1.0300084530853762e-05, + "loss": 0.4183, + "step": 9393, + "task_loss": 0.6161310076713562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22236217558383942, + "epoch": 7.94, + "learning_rate": 1.0295857988165682e-05, + "loss": 0.3654, + "step": 9394, + "task_loss": 0.42631030082702637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5708345174789429, + "epoch": 7.94, + "learning_rate": 1.02916314454776e-05, + "loss": 0.42, + "step": 9395, + "task_loss": 0.6909058690071106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46304288506507874, + "epoch": 7.94, + "learning_rate": 1.0287404902789518e-05, + "loss": 0.4389, + "step": 9396, + "task_loss": 0.8769976496696472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3119366765022278, + "epoch": 7.94, + "learning_rate": 1.0283178360101438e-05, + "loss": 0.2819, + "step": 9397, + "task_loss": 0.44786685705184937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35893315076828003, + "epoch": 7.94, + "learning_rate": 1.0278951817413357e-05, + "loss": 0.3177, + "step": 9398, + "task_loss": 0.8647464513778687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19748623669147491, + "epoch": 7.94, + "learning_rate": 1.0274725274725275e-05, + "loss": 0.3147, + "step": 9399, + "task_loss": 0.24764443933963776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6548997759819031, + "epoch": 7.95, + "learning_rate": 1.0270498732037193e-05, + "loss": 0.4937, + "step": 9400, + "task_loss": 0.5669742822647095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29760482907295227, + "epoch": 7.95, + "learning_rate": 1.0266272189349113e-05, + "loss": 0.3946, + "step": 9401, + "task_loss": 0.3779524564743042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31724005937576294, + "epoch": 7.95, + "learning_rate": 1.0262045646661033e-05, + "loss": 0.2924, + "step": 9402, + "task_loss": 0.329680472612381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3954460620880127, + "epoch": 7.95, + "learning_rate": 1.0257819103972951e-05, + "loss": 0.4274, + "step": 9403, + "task_loss": 0.7274028658866882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41761332750320435, + "epoch": 7.95, + "learning_rate": 1.0253592561284869e-05, + "loss": 0.3525, + "step": 9404, + "task_loss": 0.7435610294342041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4076928496360779, + "epoch": 7.95, + "learning_rate": 1.0249366018596789e-05, + "loss": 0.4311, + "step": 9405, + "task_loss": 0.7442341446876526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21257227659225464, + "epoch": 7.95, + "learning_rate": 1.0245139475908708e-05, + "loss": 0.289, + "step": 9406, + "task_loss": 0.09206889569759369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2921108901500702, + "epoch": 7.95, + "learning_rate": 1.0240912933220625e-05, + "loss": 0.3723, + "step": 9407, + "task_loss": 0.3296409547328949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2768428325653076, + "epoch": 7.95, + "learning_rate": 1.0236686390532545e-05, + "loss": 0.4623, + "step": 9408, + "task_loss": 1.0802311897277832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4046192169189453, + "epoch": 7.95, + "learning_rate": 1.0232459847844464e-05, + "loss": 0.4377, + "step": 9409, + "task_loss": 0.5437915921211243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27469420433044434, + "epoch": 7.95, + "learning_rate": 1.0228233305156382e-05, + "loss": 0.3587, + "step": 9410, + "task_loss": 0.22336819767951965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2558116912841797, + "epoch": 7.95, + "learning_rate": 1.02240067624683e-05, + "loss": 0.3903, + "step": 9411, + "task_loss": 0.0908508151769638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2631528973579407, + "epoch": 7.96, + "learning_rate": 1.021978021978022e-05, + "loss": 0.4251, + "step": 9412, + "task_loss": 0.45004695653915405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14367088675498962, + "epoch": 7.96, + "learning_rate": 1.021555367709214e-05, + "loss": 0.2798, + "step": 9413, + "task_loss": 0.2983837425708771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.354820191860199, + "epoch": 7.96, + "learning_rate": 1.0211327134404058e-05, + "loss": 0.4796, + "step": 9414, + "task_loss": 0.3660299777984619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3306651711463928, + "epoch": 7.96, + "learning_rate": 1.0207100591715976e-05, + "loss": 0.4183, + "step": 9415, + "task_loss": 0.7402946949005127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3089539706707001, + "epoch": 7.96, + "learning_rate": 1.0202874049027896e-05, + "loss": 0.3282, + "step": 9416, + "task_loss": 0.32596513628959656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2824006676673889, + "epoch": 7.96, + "learning_rate": 1.0198647506339814e-05, + "loss": 0.3203, + "step": 9417, + "task_loss": 0.32522502541542053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.332379549741745, + "epoch": 7.96, + "learning_rate": 1.0194420963651734e-05, + "loss": 0.4061, + "step": 9418, + "task_loss": 0.32181316614151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3873562812805176, + "epoch": 7.96, + "learning_rate": 1.0190194420963652e-05, + "loss": 0.556, + "step": 9419, + "task_loss": 0.2017328292131424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49662449955940247, + "epoch": 7.96, + "learning_rate": 1.0185967878275571e-05, + "loss": 0.4462, + "step": 9420, + "task_loss": 0.9196640253067017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5504055023193359, + "epoch": 7.96, + "learning_rate": 1.018174133558749e-05, + "loss": 0.3433, + "step": 9421, + "task_loss": 0.8239861726760864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32461249828338623, + "epoch": 7.96, + "learning_rate": 1.0177514792899409e-05, + "loss": 0.4561, + "step": 9422, + "task_loss": 1.59067964553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40270233154296875, + "epoch": 7.96, + "learning_rate": 1.0173288250211329e-05, + "loss": 0.3899, + "step": 9423, + "task_loss": 1.4099159240722656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44141674041748047, + "epoch": 7.97, + "learning_rate": 1.0169061707523247e-05, + "loss": 0.3505, + "step": 9424, + "task_loss": 0.36235588788986206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3399415910243988, + "epoch": 7.97, + "learning_rate": 1.0164835164835165e-05, + "loss": 0.294, + "step": 9425, + "task_loss": 0.5679250359535217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19249096512794495, + "epoch": 7.97, + "learning_rate": 1.0160608622147085e-05, + "loss": 0.3718, + "step": 9426, + "task_loss": 0.5124766826629639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5217880606651306, + "epoch": 7.97, + "learning_rate": 1.0156382079459004e-05, + "loss": 0.5107, + "step": 9427, + "task_loss": 0.9031076431274414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3124064803123474, + "epoch": 7.97, + "learning_rate": 1.015215553677092e-05, + "loss": 0.3755, + "step": 9428, + "task_loss": 1.2427045106887817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5272301435470581, + "epoch": 7.97, + "learning_rate": 1.014792899408284e-05, + "loss": 0.4025, + "step": 9429, + "task_loss": 0.8686051964759827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28804540634155273, + "epoch": 7.97, + "learning_rate": 1.014370245139476e-05, + "loss": 0.4005, + "step": 9430, + "task_loss": 0.7669927477836609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24318541586399078, + "epoch": 7.97, + "learning_rate": 1.0139475908706678e-05, + "loss": 0.3925, + "step": 9431, + "task_loss": 0.2363303005695343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5863572955131531, + "epoch": 7.97, + "learning_rate": 1.0135249366018596e-05, + "loss": 0.4184, + "step": 9432, + "task_loss": 1.0201488733291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4496966600418091, + "epoch": 7.97, + "learning_rate": 1.0131022823330516e-05, + "loss": 0.478, + "step": 9433, + "task_loss": 0.30431491136550903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3626857399940491, + "epoch": 7.97, + "learning_rate": 1.0126796280642436e-05, + "loss": 0.3859, + "step": 9434, + "task_loss": 0.6130858659744263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2506057620048523, + "epoch": 7.97, + "learning_rate": 1.0122569737954354e-05, + "loss": 0.5043, + "step": 9435, + "task_loss": 0.1877564936876297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3694946765899658, + "epoch": 7.98, + "learning_rate": 1.0118343195266272e-05, + "loss": 0.3978, + "step": 9436, + "task_loss": 0.7117792367935181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5303183197975159, + "epoch": 7.98, + "learning_rate": 1.0114116652578192e-05, + "loss": 0.4806, + "step": 9437, + "task_loss": 1.1630799770355225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35216695070266724, + "epoch": 7.98, + "learning_rate": 1.0109890109890111e-05, + "loss": 0.3244, + "step": 9438, + "task_loss": 0.6616082787513733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2138129025697708, + "epoch": 7.98, + "learning_rate": 1.010566356720203e-05, + "loss": 0.345, + "step": 9439, + "task_loss": 0.21335120499134064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20888842642307281, + "epoch": 7.98, + "learning_rate": 1.0101437024513947e-05, + "loss": 0.3096, + "step": 9440, + "task_loss": 0.348694771528244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31142765283584595, + "epoch": 7.98, + "learning_rate": 1.0097210481825867e-05, + "loss": 0.2904, + "step": 9441, + "task_loss": 0.6710964441299438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44097262620925903, + "epoch": 7.98, + "learning_rate": 1.0092983939137785e-05, + "loss": 0.529, + "step": 9442, + "task_loss": 0.39753127098083496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3438207507133484, + "epoch": 7.98, + "learning_rate": 1.0088757396449705e-05, + "loss": 0.348, + "step": 9443, + "task_loss": 0.642531156539917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38025474548339844, + "epoch": 7.98, + "learning_rate": 1.0084530853761623e-05, + "loss": 0.4213, + "step": 9444, + "task_loss": 0.33351853489875793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24770861864089966, + "epoch": 7.98, + "learning_rate": 1.0080304311073543e-05, + "loss": 0.3548, + "step": 9445, + "task_loss": 0.45884454250335693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33241963386535645, + "epoch": 7.98, + "learning_rate": 1.007607776838546e-05, + "loss": 0.356, + "step": 9446, + "task_loss": 0.6009525656700134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9162295460700989, + "epoch": 7.99, + "learning_rate": 1.007185122569738e-05, + "loss": 0.4755, + "step": 9447, + "task_loss": 0.8147220015525818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15189969539642334, + "epoch": 7.99, + "learning_rate": 1.0067624683009299e-05, + "loss": 0.3256, + "step": 9448, + "task_loss": 0.32074421644210815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4850310683250427, + "epoch": 7.99, + "learning_rate": 1.0063398140321217e-05, + "loss": 0.4105, + "step": 9449, + "task_loss": 0.40386563539505005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4874488115310669, + "epoch": 7.99, + "learning_rate": 1.0059171597633136e-05, + "loss": 0.414, + "step": 9450, + "task_loss": 0.5610076189041138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3008481562137604, + "epoch": 7.99, + "learning_rate": 1.0054945054945056e-05, + "loss": 0.3421, + "step": 9451, + "task_loss": 0.8644840121269226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2799399495124817, + "epoch": 7.99, + "learning_rate": 1.0050718512256974e-05, + "loss": 0.3506, + "step": 9452, + "task_loss": 0.3240704834461212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2638999819755554, + "epoch": 7.99, + "learning_rate": 1.0046491969568892e-05, + "loss": 0.32, + "step": 9453, + "task_loss": 0.24742238223552704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3064427375793457, + "epoch": 7.99, + "learning_rate": 1.0042265426880812e-05, + "loss": 0.3862, + "step": 9454, + "task_loss": 1.0167747735977173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25037920475006104, + "epoch": 7.99, + "learning_rate": 1.0038038884192732e-05, + "loss": 0.4079, + "step": 9455, + "task_loss": 0.9736714363098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3613587021827698, + "epoch": 7.99, + "learning_rate": 1.003381234150465e-05, + "loss": 0.3458, + "step": 9456, + "task_loss": 0.6769360899925232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2978385090827942, + "epoch": 7.99, + "learning_rate": 1.0029585798816568e-05, + "loss": 0.4779, + "step": 9457, + "task_loss": 0.5593798160552979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30896925926208496, + "epoch": 7.99, + "learning_rate": 1.0025359256128487e-05, + "loss": 0.4081, + "step": 9458, + "task_loss": 0.6380171775817871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.693453848361969, + "epoch": 8.0, + "learning_rate": 1.0021132713440407e-05, + "loss": 0.3954, + "step": 9459, + "task_loss": 0.6205152273178101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2568035125732422, + "epoch": 8.0, + "learning_rate": 1.0016906170752325e-05, + "loss": 0.3991, + "step": 9460, + "task_loss": 0.882075846195221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5914914608001709, + "epoch": 8.0, + "learning_rate": 1.0012679628064243e-05, + "loss": 0.3545, + "step": 9461, + "task_loss": 0.5765284895896912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4754551649093628, + "epoch": 8.0, + "learning_rate": 1.0008453085376163e-05, + "loss": 0.4868, + "step": 9462, + "task_loss": 0.3565811812877655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5446940660476685, + "epoch": 8.0, + "learning_rate": 1.0004226542688081e-05, + "loss": 0.394, + "step": 9463, + "task_loss": 0.4381572902202606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30494987964630127, + "epoch": 8.0, + "learning_rate": 1e-05, + "loss": 0.261, + "step": 9464, + "task_loss": 0.38234224915504456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3635275959968567, + "epoch": 8.0, + "learning_rate": 9.995773457311919e-06, + "loss": 0.7953, + "step": 9465, + "task_loss": 0.5007911324501038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3265039026737213, + "epoch": 8.0, + "learning_rate": 9.991546914623839e-06, + "loss": 0.3029, + "step": 9466, + "task_loss": 0.2844346761703491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26586154103279114, + "epoch": 8.0, + "learning_rate": 9.987320371935757e-06, + "loss": 0.2748, + "step": 9467, + "task_loss": 0.5997502207756042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3282037377357483, + "epoch": 8.0, + "learning_rate": 9.983093829247676e-06, + "loss": 0.3883, + "step": 9468, + "task_loss": 1.3357172012329102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27700817584991455, + "epoch": 8.0, + "learning_rate": 9.978867286559594e-06, + "loss": 0.4622, + "step": 9469, + "task_loss": 0.8319427371025085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5357617735862732, + "epoch": 8.01, + "learning_rate": 9.974640743871514e-06, + "loss": 0.3959, + "step": 9470, + "task_loss": 1.032879114151001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4563886523246765, + "epoch": 8.01, + "learning_rate": 9.970414201183432e-06, + "loss": 0.3359, + "step": 9471, + "task_loss": 0.10449904203414917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3475956916809082, + "epoch": 8.01, + "learning_rate": 9.966187658495352e-06, + "loss": 0.4776, + "step": 9472, + "task_loss": 0.39387616515159607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42351025342941284, + "epoch": 8.01, + "learning_rate": 9.96196111580727e-06, + "loss": 0.4005, + "step": 9473, + "task_loss": 0.3764391839504242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4575378894805908, + "epoch": 8.01, + "learning_rate": 9.957734573119188e-06, + "loss": 0.3436, + "step": 9474, + "task_loss": 0.26252058148384094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2280004620552063, + "epoch": 8.01, + "learning_rate": 9.953508030431108e-06, + "loss": 0.3761, + "step": 9475, + "task_loss": 0.14729619026184082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.50696861743927, + "epoch": 8.01, + "learning_rate": 9.949281487743028e-06, + "loss": 0.3973, + "step": 9476, + "task_loss": 1.009718656539917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1829715520143509, + "epoch": 8.01, + "learning_rate": 9.945054945054946e-06, + "loss": 0.3651, + "step": 9477, + "task_loss": 0.14630205929279327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.456539124250412, + "epoch": 8.01, + "learning_rate": 9.940828402366864e-06, + "loss": 0.3854, + "step": 9478, + "task_loss": 0.4401250183582306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2648431658744812, + "epoch": 8.01, + "learning_rate": 9.936601859678783e-06, + "loss": 0.3288, + "step": 9479, + "task_loss": 0.04418956860899925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4458216428756714, + "epoch": 8.01, + "learning_rate": 9.932375316990703e-06, + "loss": 0.4529, + "step": 9480, + "task_loss": 0.3048208951950073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39277467131614685, + "epoch": 8.01, + "learning_rate": 9.92814877430262e-06, + "loss": 0.4193, + "step": 9481, + "task_loss": 0.4199518859386444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1766292303800583, + "epoch": 8.02, + "learning_rate": 9.92392223161454e-06, + "loss": 0.3768, + "step": 9482, + "task_loss": 0.22615107893943787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47020259499549866, + "epoch": 8.02, + "learning_rate": 9.919695688926459e-06, + "loss": 0.3335, + "step": 9483, + "task_loss": 0.14356809854507446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26510873436927795, + "epoch": 8.02, + "learning_rate": 9.915469146238379e-06, + "loss": 0.2679, + "step": 9484, + "task_loss": 0.1624196171760559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28390875458717346, + "epoch": 8.02, + "learning_rate": 9.911242603550297e-06, + "loss": 0.4356, + "step": 9485, + "task_loss": 0.4377221167087555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2791476547718048, + "epoch": 8.02, + "learning_rate": 9.907016060862215e-06, + "loss": 0.4166, + "step": 9486, + "task_loss": 1.0479180812835693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3591724634170532, + "epoch": 8.02, + "learning_rate": 9.902789518174134e-06, + "loss": 0.4586, + "step": 9487, + "task_loss": 0.7510116696357727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3330943286418915, + "epoch": 8.02, + "learning_rate": 9.898562975486053e-06, + "loss": 0.3825, + "step": 9488, + "task_loss": 0.8075860142707825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37684503197669983, + "epoch": 8.02, + "learning_rate": 9.894336432797972e-06, + "loss": 0.3881, + "step": 9489, + "task_loss": 0.949891984462738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4541041851043701, + "epoch": 8.02, + "learning_rate": 9.89010989010989e-06, + "loss": 0.3906, + "step": 9490, + "task_loss": 0.6921858787536621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6335106492042542, + "epoch": 8.02, + "learning_rate": 9.88588334742181e-06, + "loss": 0.4252, + "step": 9491, + "task_loss": 1.0319186449050903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4646127223968506, + "epoch": 8.02, + "learning_rate": 9.881656804733728e-06, + "loss": 0.4588, + "step": 9492, + "task_loss": 1.4515480995178223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3685835301876068, + "epoch": 8.02, + "learning_rate": 9.877430262045648e-06, + "loss": 0.3323, + "step": 9493, + "task_loss": 0.16782687604427338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.400140643119812, + "epoch": 8.03, + "learning_rate": 9.873203719357566e-06, + "loss": 0.407, + "step": 9494, + "task_loss": 0.383005291223526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4261009097099304, + "epoch": 8.03, + "learning_rate": 9.868977176669484e-06, + "loss": 0.3872, + "step": 9495, + "task_loss": 0.805411696434021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5084391236305237, + "epoch": 8.03, + "learning_rate": 9.864750633981404e-06, + "loss": 0.4502, + "step": 9496, + "task_loss": 0.31891733407974243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30456095933914185, + "epoch": 8.03, + "learning_rate": 9.860524091293323e-06, + "loss": 0.3274, + "step": 9497, + "task_loss": 0.3608904480934143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2846469283103943, + "epoch": 8.03, + "learning_rate": 9.856297548605241e-06, + "loss": 0.4728, + "step": 9498, + "task_loss": 0.7982349395751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3022933602333069, + "epoch": 8.03, + "learning_rate": 9.85207100591716e-06, + "loss": 0.4673, + "step": 9499, + "task_loss": 0.7299861907958984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40707725286483765, + "epoch": 8.03, + "learning_rate": 9.84784446322908e-06, + "loss": 0.4163, + "step": 9500, + "task_loss": 0.5602826476097107 + }, + { + "epoch": 8.03, + "eval_accuracy": 0.9165544554455446, + "eval_loss": 0.2582683563232422, + "eval_runtime": 226.4874, + "eval_samples_per_second": 111.485, + "eval_steps_per_second": 0.874, + "step": 9500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40723666548728943, + "epoch": 8.03, + "learning_rate": 9.843617920540999e-06, + "loss": 0.3456, + "step": 9501, + "task_loss": 0.303974449634552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3229803740978241, + "epoch": 8.03, + "learning_rate": 9.839391377852917e-06, + "loss": 0.383, + "step": 9502, + "task_loss": 0.8068796396255493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37747031450271606, + "epoch": 8.03, + "learning_rate": 9.835164835164835e-06, + "loss": 0.3836, + "step": 9503, + "task_loss": 1.0041242837905884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39161014556884766, + "epoch": 8.03, + "learning_rate": 9.830938292476755e-06, + "loss": 0.3756, + "step": 9504, + "task_loss": 0.7075904607772827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17666351795196533, + "epoch": 8.03, + "learning_rate": 9.826711749788675e-06, + "loss": 0.358, + "step": 9505, + "task_loss": 0.3021080195903778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.272285521030426, + "epoch": 8.04, + "learning_rate": 9.822485207100591e-06, + "loss": 0.3603, + "step": 9506, + "task_loss": 0.40746957063674927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33424729108810425, + "epoch": 8.04, + "learning_rate": 9.81825866441251e-06, + "loss": 0.4974, + "step": 9507, + "task_loss": 0.7196852564811707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49253612756729126, + "epoch": 8.04, + "learning_rate": 9.81403212172443e-06, + "loss": 0.4067, + "step": 9508, + "task_loss": 0.6764450073242188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30621182918548584, + "epoch": 8.04, + "learning_rate": 9.809805579036348e-06, + "loss": 0.3965, + "step": 9509, + "task_loss": 0.6441108584403992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5450552105903625, + "epoch": 8.04, + "learning_rate": 9.805579036348266e-06, + "loss": 0.4649, + "step": 9510, + "task_loss": 0.934654951095581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34476178884506226, + "epoch": 8.04, + "learning_rate": 9.801352493660186e-06, + "loss": 0.3365, + "step": 9511, + "task_loss": 0.5674360990524292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3293823003768921, + "epoch": 8.04, + "learning_rate": 9.797125950972106e-06, + "loss": 0.3946, + "step": 9512, + "task_loss": 0.36159422993659973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32382678985595703, + "epoch": 8.04, + "learning_rate": 9.792899408284024e-06, + "loss": 0.3184, + "step": 9513, + "task_loss": 1.3337057828903198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21925796568393707, + "epoch": 8.04, + "learning_rate": 9.788672865595944e-06, + "loss": 0.343, + "step": 9514, + "task_loss": 0.23473528027534485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29959332942962646, + "epoch": 8.04, + "learning_rate": 9.784446322907862e-06, + "loss": 0.3779, + "step": 9515, + "task_loss": 1.0600448846817017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2582482695579529, + "epoch": 8.04, + "learning_rate": 9.78021978021978e-06, + "loss": 0.3906, + "step": 9516, + "task_loss": 0.4543503522872925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23099832236766815, + "epoch": 8.04, + "learning_rate": 9.7759932375317e-06, + "loss": 0.3231, + "step": 9517, + "task_loss": 0.4425613284111023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4333673119544983, + "epoch": 8.05, + "learning_rate": 9.77176669484362e-06, + "loss": 0.3638, + "step": 9518, + "task_loss": 0.5323060750961304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38231539726257324, + "epoch": 8.05, + "learning_rate": 9.767540152155537e-06, + "loss": 0.3196, + "step": 9519, + "task_loss": 0.6407788395881653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4929042160511017, + "epoch": 8.05, + "learning_rate": 9.763313609467455e-06, + "loss": 0.4189, + "step": 9520, + "task_loss": 0.49906104803085327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3395448923110962, + "epoch": 8.05, + "learning_rate": 9.759087066779375e-06, + "loss": 0.4829, + "step": 9521, + "task_loss": 0.20274889469146729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37314921617507935, + "epoch": 8.05, + "learning_rate": 9.754860524091295e-06, + "loss": 0.5312, + "step": 9522, + "task_loss": 0.3446331322193146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25583717226982117, + "epoch": 8.05, + "learning_rate": 9.750633981403213e-06, + "loss": 0.3544, + "step": 9523, + "task_loss": 0.1991509646177292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23117810487747192, + "epoch": 8.05, + "learning_rate": 9.746407438715131e-06, + "loss": 0.3681, + "step": 9524, + "task_loss": 0.18167535960674286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24815919995307922, + "epoch": 8.05, + "learning_rate": 9.74218089602705e-06, + "loss": 0.3838, + "step": 9525, + "task_loss": 0.41123390197753906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21954123675823212, + "epoch": 8.05, + "learning_rate": 9.73795435333897e-06, + "loss": 0.2999, + "step": 9526, + "task_loss": 0.14730186760425568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42896896600723267, + "epoch": 8.05, + "learning_rate": 9.733727810650887e-06, + "loss": 0.3161, + "step": 9527, + "task_loss": 1.0610079765319824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4465399384498596, + "epoch": 8.05, + "learning_rate": 9.729501267962806e-06, + "loss": 0.3657, + "step": 9528, + "task_loss": 0.5230749845504761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6194783449172974, + "epoch": 8.05, + "learning_rate": 9.725274725274726e-06, + "loss": 0.4487, + "step": 9529, + "task_loss": 0.8953304290771484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4146515727043152, + "epoch": 8.06, + "learning_rate": 9.721048182586644e-06, + "loss": 0.4122, + "step": 9530, + "task_loss": 0.6930051445960999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5738439559936523, + "epoch": 8.06, + "learning_rate": 9.716821639898562e-06, + "loss": 0.3722, + "step": 9531, + "task_loss": 0.5267540812492371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3677220940589905, + "epoch": 8.06, + "learning_rate": 9.712595097210482e-06, + "loss": 0.2843, + "step": 9532, + "task_loss": 0.3634708523750305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4567508101463318, + "epoch": 8.06, + "learning_rate": 9.708368554522402e-06, + "loss": 0.3547, + "step": 9533, + "task_loss": 0.6932446360588074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4244416654109955, + "epoch": 8.06, + "learning_rate": 9.70414201183432e-06, + "loss": 0.4385, + "step": 9534, + "task_loss": 0.5224007964134216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4627702832221985, + "epoch": 8.06, + "learning_rate": 9.699915469146238e-06, + "loss": 0.3477, + "step": 9535, + "task_loss": 0.3209187090396881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3206583559513092, + "epoch": 8.06, + "learning_rate": 9.695688926458158e-06, + "loss": 0.3176, + "step": 9536, + "task_loss": 0.09001386910676956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17339655756950378, + "epoch": 8.06, + "learning_rate": 9.691462383770077e-06, + "loss": 0.3186, + "step": 9537, + "task_loss": 0.2191508412361145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5378230214118958, + "epoch": 8.06, + "learning_rate": 9.687235841081995e-06, + "loss": 0.3741, + "step": 9538, + "task_loss": 0.908126950263977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3767419159412384, + "epoch": 8.06, + "learning_rate": 9.683009298393913e-06, + "loss": 0.3763, + "step": 9539, + "task_loss": 0.590830385684967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3942040205001831, + "epoch": 8.06, + "learning_rate": 9.678782755705833e-06, + "loss": 0.4315, + "step": 9540, + "task_loss": 0.3798633813858032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39076346158981323, + "epoch": 8.07, + "learning_rate": 9.674556213017751e-06, + "loss": 0.4078, + "step": 9541, + "task_loss": 0.6101036071777344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21464087069034576, + "epoch": 8.07, + "learning_rate": 9.670329670329671e-06, + "loss": 0.3844, + "step": 9542, + "task_loss": 1.1720894575119019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3350306749343872, + "epoch": 8.07, + "learning_rate": 9.66610312764159e-06, + "loss": 0.3873, + "step": 9543, + "task_loss": 0.4297977387905121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47340577840805054, + "epoch": 8.07, + "learning_rate": 9.661876584953509e-06, + "loss": 0.3778, + "step": 9544, + "task_loss": 0.25794732570648193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5419970750808716, + "epoch": 8.07, + "learning_rate": 9.657650042265427e-06, + "loss": 0.3778, + "step": 9545, + "task_loss": 0.5469459891319275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45987582206726074, + "epoch": 8.07, + "learning_rate": 9.653423499577347e-06, + "loss": 0.3318, + "step": 9546, + "task_loss": 1.442956805229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4366358518600464, + "epoch": 8.07, + "learning_rate": 9.649196956889266e-06, + "loss": 0.5575, + "step": 9547, + "task_loss": 0.6381012201309204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4083651900291443, + "epoch": 8.07, + "learning_rate": 9.644970414201183e-06, + "loss": 0.4026, + "step": 9548, + "task_loss": 0.4918254315853119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6344305276870728, + "epoch": 8.07, + "learning_rate": 9.640743871513102e-06, + "loss": 0.4557, + "step": 9549, + "task_loss": 0.7844622135162354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2742043137550354, + "epoch": 8.07, + "learning_rate": 9.636517328825022e-06, + "loss": 0.3116, + "step": 9550, + "task_loss": 0.43593645095825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.316173791885376, + "epoch": 8.07, + "learning_rate": 9.632290786136942e-06, + "loss": 0.3362, + "step": 9551, + "task_loss": 0.2711165249347687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37108156085014343, + "epoch": 8.07, + "learning_rate": 9.628064243448858e-06, + "loss": 0.3363, + "step": 9552, + "task_loss": 0.6358299255371094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2919468879699707, + "epoch": 8.08, + "learning_rate": 9.623837700760778e-06, + "loss": 0.3776, + "step": 9553, + "task_loss": 0.7901868224143982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5201333165168762, + "epoch": 8.08, + "learning_rate": 9.619611158072698e-06, + "loss": 0.4396, + "step": 9554, + "task_loss": 0.39068761467933655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3648889660835266, + "epoch": 8.08, + "learning_rate": 9.615384615384616e-06, + "loss": 0.4203, + "step": 9555, + "task_loss": 1.0932284593582153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4836452603340149, + "epoch": 8.08, + "learning_rate": 9.611158072696534e-06, + "loss": 0.389, + "step": 9556, + "task_loss": 0.7916218042373657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27200859785079956, + "epoch": 8.08, + "learning_rate": 9.606931530008453e-06, + "loss": 0.2992, + "step": 9557, + "task_loss": 0.23351293802261353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2762584686279297, + "epoch": 8.08, + "learning_rate": 9.602704987320373e-06, + "loss": 0.344, + "step": 9558, + "task_loss": 0.2612742781639099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46028584241867065, + "epoch": 8.08, + "learning_rate": 9.598478444632291e-06, + "loss": 0.4589, + "step": 9559, + "task_loss": 0.4408276677131653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2632410526275635, + "epoch": 8.08, + "learning_rate": 9.59425190194421e-06, + "loss": 0.2922, + "step": 9560, + "task_loss": 0.4985095262527466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.492450475692749, + "epoch": 8.08, + "learning_rate": 9.590025359256129e-06, + "loss": 0.3553, + "step": 9561, + "task_loss": 0.42737436294555664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2609958350658417, + "epoch": 8.08, + "learning_rate": 9.585798816568047e-06, + "loss": 0.4056, + "step": 9562, + "task_loss": 0.6591688990592957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23907506465911865, + "epoch": 8.08, + "learning_rate": 9.581572273879967e-06, + "loss": 0.3425, + "step": 9563, + "task_loss": 0.7152498960494995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24413537979125977, + "epoch": 8.08, + "learning_rate": 9.577345731191885e-06, + "loss": 0.3215, + "step": 9564, + "task_loss": 0.31466352939605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5873289704322815, + "epoch": 8.09, + "learning_rate": 9.573119188503805e-06, + "loss": 0.5086, + "step": 9565, + "task_loss": 0.6502830982208252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15625923871994019, + "epoch": 8.09, + "learning_rate": 9.568892645815723e-06, + "loss": 0.4039, + "step": 9566, + "task_loss": 0.03125901147723198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30555737018585205, + "epoch": 8.09, + "learning_rate": 9.564666103127642e-06, + "loss": 0.3101, + "step": 9567, + "task_loss": 0.6276337504386902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6364552974700928, + "epoch": 8.09, + "learning_rate": 9.56043956043956e-06, + "loss": 0.426, + "step": 9568, + "task_loss": 1.0174585580825806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4175097644329071, + "epoch": 8.09, + "learning_rate": 9.55621301775148e-06, + "loss": 0.4443, + "step": 9569, + "task_loss": 1.0049359798431396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6983851790428162, + "epoch": 8.09, + "learning_rate": 9.551986475063398e-06, + "loss": 0.479, + "step": 9570, + "task_loss": 0.677760660648346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40620875358581543, + "epoch": 8.09, + "learning_rate": 9.547759932375318e-06, + "loss": 0.2764, + "step": 9571, + "task_loss": 0.9887720942497253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3389171063899994, + "epoch": 8.09, + "learning_rate": 9.543533389687238e-06, + "loss": 0.2897, + "step": 9572, + "task_loss": 0.5915806293487549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7092840671539307, + "epoch": 8.09, + "learning_rate": 9.539306846999154e-06, + "loss": 0.323, + "step": 9573, + "task_loss": 1.0367742776870728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3712989389896393, + "epoch": 8.09, + "learning_rate": 9.535080304311074e-06, + "loss": 0.6249, + "step": 9574, + "task_loss": 0.333400160074234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40600138902664185, + "epoch": 8.09, + "learning_rate": 9.530853761622994e-06, + "loss": 0.3419, + "step": 9575, + "task_loss": 0.649448812007904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27573418617248535, + "epoch": 8.09, + "learning_rate": 9.526627218934912e-06, + "loss": 0.3511, + "step": 9576, + "task_loss": 1.095445156097412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30102992057800293, + "epoch": 8.1, + "learning_rate": 9.52240067624683e-06, + "loss": 0.3486, + "step": 9577, + "task_loss": 0.22039470076560974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3979472219944, + "epoch": 8.1, + "learning_rate": 9.51817413355875e-06, + "loss": 0.3315, + "step": 9578, + "task_loss": 0.41883575916290283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6579056978225708, + "epoch": 8.1, + "learning_rate": 9.513947590870669e-06, + "loss": 0.4493, + "step": 9579, + "task_loss": 0.9873151183128357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.50331050157547, + "epoch": 8.1, + "learning_rate": 9.509721048182587e-06, + "loss": 0.2954, + "step": 9580, + "task_loss": 0.2561885118484497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17993547022342682, + "epoch": 8.1, + "learning_rate": 9.505494505494505e-06, + "loss": 0.4296, + "step": 9581, + "task_loss": 0.47484517097473145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1356126070022583, + "epoch": 8.1, + "learning_rate": 9.501267962806425e-06, + "loss": 0.2887, + "step": 9582, + "task_loss": 0.5511762499809265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38288500905036926, + "epoch": 8.1, + "learning_rate": 9.497041420118345e-06, + "loss": 0.3238, + "step": 9583, + "task_loss": 0.6213030815124512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6185067892074585, + "epoch": 8.1, + "learning_rate": 9.492814877430263e-06, + "loss": 0.484, + "step": 9584, + "task_loss": 0.45754849910736084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24267931282520294, + "epoch": 8.1, + "learning_rate": 9.48858833474218e-06, + "loss": 0.3728, + "step": 9585, + "task_loss": 0.8542752861976624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25541970133781433, + "epoch": 8.1, + "learning_rate": 9.4843617920541e-06, + "loss": 0.3445, + "step": 9586, + "task_loss": 0.3271564245223999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42707839608192444, + "epoch": 8.1, + "learning_rate": 9.480135249366019e-06, + "loss": 0.433, + "step": 9587, + "task_loss": 0.9447238445281982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26518845558166504, + "epoch": 8.1, + "learning_rate": 9.475908706677938e-06, + "loss": 0.3891, + "step": 9588, + "task_loss": 0.30231887102127075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2763517498970032, + "epoch": 8.11, + "learning_rate": 9.471682163989856e-06, + "loss": 0.3606, + "step": 9589, + "task_loss": 0.38724765181541443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3539882004261017, + "epoch": 8.11, + "learning_rate": 9.467455621301776e-06, + "loss": 0.4567, + "step": 9590, + "task_loss": 0.6587515473365784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3378356993198395, + "epoch": 8.11, + "learning_rate": 9.463229078613694e-06, + "loss": 0.4536, + "step": 9591, + "task_loss": 0.8763216137886047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35996103286743164, + "epoch": 8.11, + "learning_rate": 9.459002535925614e-06, + "loss": 0.3752, + "step": 9592, + "task_loss": 1.6664866209030151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3141550123691559, + "epoch": 8.11, + "learning_rate": 9.454775993237532e-06, + "loss": 0.4192, + "step": 9593, + "task_loss": 0.5907471179962158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.316249817609787, + "epoch": 8.11, + "learning_rate": 9.45054945054945e-06, + "loss": 0.3713, + "step": 9594, + "task_loss": 0.2997492253780365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5545806288719177, + "epoch": 8.11, + "learning_rate": 9.44632290786137e-06, + "loss": 0.4512, + "step": 9595, + "task_loss": 1.0716569423675537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38335737586021423, + "epoch": 8.11, + "learning_rate": 9.44209636517329e-06, + "loss": 0.3363, + "step": 9596, + "task_loss": 0.36720141768455505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3329651951789856, + "epoch": 8.11, + "learning_rate": 9.437869822485207e-06, + "loss": 0.2881, + "step": 9597, + "task_loss": 0.6670209169387817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5699474811553955, + "epoch": 8.11, + "learning_rate": 9.433643279797126e-06, + "loss": 0.4345, + "step": 9598, + "task_loss": 0.3198203146457672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5423785448074341, + "epoch": 8.11, + "learning_rate": 9.429416737109045e-06, + "loss": 0.5017, + "step": 9599, + "task_loss": 1.515604019165039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34926584362983704, + "epoch": 8.11, + "learning_rate": 9.425190194420965e-06, + "loss": 0.4044, + "step": 9600, + "task_loss": 1.0147638320922852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4368707835674286, + "epoch": 8.12, + "learning_rate": 9.420963651732883e-06, + "loss": 0.3993, + "step": 9601, + "task_loss": 1.1315109729766846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36856797337532043, + "epoch": 8.12, + "learning_rate": 9.416737109044801e-06, + "loss": 0.3545, + "step": 9602, + "task_loss": 0.6522079110145569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3776322901248932, + "epoch": 8.12, + "learning_rate": 9.41251056635672e-06, + "loss": 0.5057, + "step": 9603, + "task_loss": 1.1521193981170654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24178169667720795, + "epoch": 8.12, + "learning_rate": 9.40828402366864e-06, + "loss": 0.342, + "step": 9604, + "task_loss": 0.25959545373916626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5928804278373718, + "epoch": 8.12, + "learning_rate": 9.404057480980559e-06, + "loss": 0.5401, + "step": 9605, + "task_loss": 1.4422557353973389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34911155700683594, + "epoch": 8.12, + "learning_rate": 9.399830938292477e-06, + "loss": 0.3286, + "step": 9606, + "task_loss": 0.33856016397476196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5973302721977234, + "epoch": 8.12, + "learning_rate": 9.395604395604396e-06, + "loss": 0.4587, + "step": 9607, + "task_loss": 0.3795664608478546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3140881061553955, + "epoch": 8.12, + "learning_rate": 9.391377852916314e-06, + "loss": 0.3214, + "step": 9608, + "task_loss": 0.20383039116859436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2899821102619171, + "epoch": 8.12, + "learning_rate": 9.387151310228234e-06, + "loss": 0.3746, + "step": 9609, + "task_loss": 0.1717749387025833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5864571332931519, + "epoch": 8.12, + "learning_rate": 9.382924767540152e-06, + "loss": 0.3729, + "step": 9610, + "task_loss": 0.9384562969207764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33046454191207886, + "epoch": 8.12, + "learning_rate": 9.378698224852072e-06, + "loss": 0.2951, + "step": 9611, + "task_loss": 0.7030954957008362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29536837339401245, + "epoch": 8.13, + "learning_rate": 9.37447168216399e-06, + "loss": 0.5172, + "step": 9612, + "task_loss": 0.6117832660675049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2744894027709961, + "epoch": 8.13, + "learning_rate": 9.37024513947591e-06, + "loss": 0.3147, + "step": 9613, + "task_loss": 0.4742885231971741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2914201021194458, + "epoch": 8.13, + "learning_rate": 9.366018596787828e-06, + "loss": 0.3221, + "step": 9614, + "task_loss": 1.2450988292694092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3952803313732147, + "epoch": 8.13, + "learning_rate": 9.361792054099748e-06, + "loss": 0.3611, + "step": 9615, + "task_loss": 0.5938776135444641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28048014640808105, + "epoch": 8.13, + "learning_rate": 9.357565511411666e-06, + "loss": 0.3374, + "step": 9616, + "task_loss": 1.7317402362823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.380979061126709, + "epoch": 8.13, + "learning_rate": 9.353338968723585e-06, + "loss": 0.4017, + "step": 9617, + "task_loss": 0.19807791709899902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3415282368659973, + "epoch": 8.13, + "learning_rate": 9.349112426035503e-06, + "loss": 0.2835, + "step": 9618, + "task_loss": 0.6596552729606628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3305451273918152, + "epoch": 8.13, + "learning_rate": 9.344885883347421e-06, + "loss": 0.4564, + "step": 9619, + "task_loss": 1.1519514322280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6515567898750305, + "epoch": 8.13, + "learning_rate": 9.340659340659341e-06, + "loss": 0.3777, + "step": 9620, + "task_loss": 0.9401289224624634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24718138575553894, + "epoch": 8.13, + "learning_rate": 9.336432797971261e-06, + "loss": 0.3209, + "step": 9621, + "task_loss": 1.3335468769073486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4232349395751953, + "epoch": 8.13, + "learning_rate": 9.332206255283179e-06, + "loss": 0.4646, + "step": 9622, + "task_loss": 0.727961540222168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45962831377983093, + "epoch": 8.13, + "learning_rate": 9.327979712595097e-06, + "loss": 0.4045, + "step": 9623, + "task_loss": 0.4596060812473297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3557490110397339, + "epoch": 8.14, + "learning_rate": 9.323753169907017e-06, + "loss": 0.3699, + "step": 9624, + "task_loss": 1.2663525342941284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2863536477088928, + "epoch": 8.14, + "learning_rate": 9.319526627218936e-06, + "loss": 0.3268, + "step": 9625, + "task_loss": 0.09632904082536697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22954314947128296, + "epoch": 8.14, + "learning_rate": 9.315300084530853e-06, + "loss": 0.2577, + "step": 9626, + "task_loss": 0.15313583612442017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3341369032859802, + "epoch": 8.14, + "learning_rate": 9.311073541842773e-06, + "loss": 0.4661, + "step": 9627, + "task_loss": 0.893329381942749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26209360361099243, + "epoch": 8.14, + "learning_rate": 9.306846999154692e-06, + "loss": 0.405, + "step": 9628, + "task_loss": 0.04880192503333092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.528457760810852, + "epoch": 8.14, + "learning_rate": 9.30262045646661e-06, + "loss": 0.4759, + "step": 9629, + "task_loss": 1.0006113052368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18417946994304657, + "epoch": 8.14, + "learning_rate": 9.298393913778528e-06, + "loss": 0.3162, + "step": 9630, + "task_loss": 0.18520507216453552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.270965576171875, + "epoch": 8.14, + "learning_rate": 9.294167371090448e-06, + "loss": 0.2886, + "step": 9631, + "task_loss": 0.11668547242879868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32560622692108154, + "epoch": 8.14, + "learning_rate": 9.289940828402368e-06, + "loss": 0.3888, + "step": 9632, + "task_loss": 0.3255392909049988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.09497303515672684, + "epoch": 8.14, + "learning_rate": 9.285714285714286e-06, + "loss": 0.348, + "step": 9633, + "task_loss": 0.03355012834072113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4106968939304352, + "epoch": 8.14, + "learning_rate": 9.281487743026206e-06, + "loss": 0.4313, + "step": 9634, + "task_loss": 1.1803704500198364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5191466808319092, + "epoch": 8.14, + "learning_rate": 9.277261200338124e-06, + "loss": 0.3892, + "step": 9635, + "task_loss": 0.6688359975814819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6053226590156555, + "epoch": 8.15, + "learning_rate": 9.273034657650043e-06, + "loss": 0.5255, + "step": 9636, + "task_loss": 0.32775962352752686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39070868492126465, + "epoch": 8.15, + "learning_rate": 9.268808114961961e-06, + "loss": 0.4261, + "step": 9637, + "task_loss": 0.3421628177165985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5107582807540894, + "epoch": 8.15, + "learning_rate": 9.264581572273881e-06, + "loss": 0.4467, + "step": 9638, + "task_loss": 0.6848765015602112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16719526052474976, + "epoch": 8.15, + "learning_rate": 9.2603550295858e-06, + "loss": 0.3027, + "step": 9639, + "task_loss": 0.02444460801780224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3925929665565491, + "epoch": 8.15, + "learning_rate": 9.256128486897717e-06, + "loss": 0.4696, + "step": 9640, + "task_loss": 0.3108866512775421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18128415942192078, + "epoch": 8.15, + "learning_rate": 9.251901944209637e-06, + "loss": 0.3648, + "step": 9641, + "task_loss": 0.47694361209869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36158427596092224, + "epoch": 8.15, + "learning_rate": 9.247675401521557e-06, + "loss": 0.4199, + "step": 9642, + "task_loss": 0.527518630027771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.613684892654419, + "epoch": 8.15, + "learning_rate": 9.243448858833475e-06, + "loss": 0.3951, + "step": 9643, + "task_loss": 1.2893481254577637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6827993988990784, + "epoch": 8.15, + "learning_rate": 9.239222316145393e-06, + "loss": 0.3756, + "step": 9644, + "task_loss": 0.5007761716842651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33941271901130676, + "epoch": 8.15, + "learning_rate": 9.234995773457313e-06, + "loss": 0.4091, + "step": 9645, + "task_loss": 0.48540809750556946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31250306963920593, + "epoch": 8.15, + "learning_rate": 9.230769230769232e-06, + "loss": 0.3187, + "step": 9646, + "task_loss": 0.24228867888450623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33825749158859253, + "epoch": 8.15, + "learning_rate": 9.22654268808115e-06, + "loss": 0.5415, + "step": 9647, + "task_loss": 1.082698106765747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3332681953907013, + "epoch": 8.16, + "learning_rate": 9.222316145393068e-06, + "loss": 0.3263, + "step": 9648, + "task_loss": 1.0833981037139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6021872758865356, + "epoch": 8.16, + "learning_rate": 9.218089602704988e-06, + "loss": 0.5312, + "step": 9649, + "task_loss": 0.14616775512695312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.344882994890213, + "epoch": 8.16, + "learning_rate": 9.213863060016908e-06, + "loss": 0.4013, + "step": 9650, + "task_loss": 0.4697834849357605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3713876008987427, + "epoch": 8.16, + "learning_rate": 9.209636517328824e-06, + "loss": 0.3729, + "step": 9651, + "task_loss": 0.9847646951675415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5445247888565063, + "epoch": 8.16, + "learning_rate": 9.205409974640744e-06, + "loss": 0.34, + "step": 9652, + "task_loss": 0.3472640812397003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31654882431030273, + "epoch": 8.16, + "learning_rate": 9.201183431952664e-06, + "loss": 0.4552, + "step": 9653, + "task_loss": 0.35890087485313416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4858705401420593, + "epoch": 8.16, + "learning_rate": 9.196956889264582e-06, + "loss": 0.3702, + "step": 9654, + "task_loss": 0.8788583874702454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27804529666900635, + "epoch": 8.16, + "learning_rate": 9.1927303465765e-06, + "loss": 0.3869, + "step": 9655, + "task_loss": 0.6036635041236877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5971202850341797, + "epoch": 8.16, + "learning_rate": 9.18850380388842e-06, + "loss": 0.436, + "step": 9656, + "task_loss": 0.9230960011482239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47867780923843384, + "epoch": 8.16, + "learning_rate": 9.18427726120034e-06, + "loss": 0.3747, + "step": 9657, + "task_loss": 1.0159308910369873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6197766661643982, + "epoch": 8.16, + "learning_rate": 9.180050718512257e-06, + "loss": 0.3975, + "step": 9658, + "task_loss": 0.32422152161598206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2516373097896576, + "epoch": 8.16, + "learning_rate": 9.175824175824175e-06, + "loss": 0.4312, + "step": 9659, + "task_loss": 0.6875790953636169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28631240129470825, + "epoch": 8.17, + "learning_rate": 9.171597633136095e-06, + "loss": 0.3234, + "step": 9660, + "task_loss": 1.2962371110916138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38545215129852295, + "epoch": 8.17, + "learning_rate": 9.167371090448013e-06, + "loss": 0.3394, + "step": 9661, + "task_loss": 0.37363725900650024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3250458240509033, + "epoch": 8.17, + "learning_rate": 9.163144547759933e-06, + "loss": 0.2828, + "step": 9662, + "task_loss": 0.7471491098403931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1687881499528885, + "epoch": 8.17, + "learning_rate": 9.158918005071853e-06, + "loss": 0.2651, + "step": 9663, + "task_loss": 0.0961441919207573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.299958199262619, + "epoch": 8.17, + "learning_rate": 9.15469146238377e-06, + "loss": 0.3634, + "step": 9664, + "task_loss": 0.8547005653381348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5923066139221191, + "epoch": 8.17, + "learning_rate": 9.150464919695689e-06, + "loss": 0.4031, + "step": 9665, + "task_loss": 0.9815897345542908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4318762421607971, + "epoch": 8.17, + "learning_rate": 9.146238377007608e-06, + "loss": 0.5062, + "step": 9666, + "task_loss": 0.6969980001449585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35791635513305664, + "epoch": 8.17, + "learning_rate": 9.142011834319528e-06, + "loss": 0.3011, + "step": 9667, + "task_loss": 0.42164865136146545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23975449800491333, + "epoch": 8.17, + "learning_rate": 9.137785291631446e-06, + "loss": 0.2788, + "step": 9668, + "task_loss": 0.5259520411491394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3312453031539917, + "epoch": 8.17, + "learning_rate": 9.133558748943364e-06, + "loss": 0.3475, + "step": 9669, + "task_loss": 0.928041934967041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.319538950920105, + "epoch": 8.17, + "learning_rate": 9.129332206255284e-06, + "loss": 0.35, + "step": 9670, + "task_loss": 0.7993912696838379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18613344430923462, + "epoch": 8.17, + "learning_rate": 9.125105663567204e-06, + "loss": 0.3406, + "step": 9671, + "task_loss": 0.4923597574234009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6443765759468079, + "epoch": 8.18, + "learning_rate": 9.12087912087912e-06, + "loss": 0.432, + "step": 9672, + "task_loss": 1.3538999557495117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30873745679855347, + "epoch": 8.18, + "learning_rate": 9.11665257819104e-06, + "loss": 0.3048, + "step": 9673, + "task_loss": 0.7474343180656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32002562284469604, + "epoch": 8.18, + "learning_rate": 9.11242603550296e-06, + "loss": 0.2717, + "step": 9674, + "task_loss": 0.9239750504493713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32411304116249084, + "epoch": 8.18, + "learning_rate": 9.108199492814878e-06, + "loss": 0.2731, + "step": 9675, + "task_loss": 0.2891104817390442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31777292490005493, + "epoch": 8.18, + "learning_rate": 9.103972950126796e-06, + "loss": 0.4542, + "step": 9676, + "task_loss": 1.070147156715393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34139904379844666, + "epoch": 8.18, + "learning_rate": 9.099746407438715e-06, + "loss": 0.3966, + "step": 9677, + "task_loss": 0.21527938544750214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3053198456764221, + "epoch": 8.18, + "learning_rate": 9.095519864750635e-06, + "loss": 0.4166, + "step": 9678, + "task_loss": 0.9368047714233398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23771579563617706, + "epoch": 8.18, + "learning_rate": 9.091293322062553e-06, + "loss": 0.4659, + "step": 9679, + "task_loss": 0.3267645239830017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26075536012649536, + "epoch": 8.18, + "learning_rate": 9.087066779374471e-06, + "loss": 0.2942, + "step": 9680, + "task_loss": 0.0958411917090416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42152759432792664, + "epoch": 8.18, + "learning_rate": 9.082840236686391e-06, + "loss": 0.3547, + "step": 9681, + "task_loss": 0.39244091510772705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3182089030742645, + "epoch": 8.18, + "learning_rate": 9.07861369399831e-06, + "loss": 0.4266, + "step": 9682, + "task_loss": 0.2878417670726776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33279967308044434, + "epoch": 8.19, + "learning_rate": 9.074387151310229e-06, + "loss": 0.4432, + "step": 9683, + "task_loss": 0.1174153983592987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40457475185394287, + "epoch": 8.19, + "learning_rate": 9.070160608622147e-06, + "loss": 0.3944, + "step": 9684, + "task_loss": 0.8658977746963501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29502081871032715, + "epoch": 8.19, + "learning_rate": 9.065934065934067e-06, + "loss": 0.2998, + "step": 9685, + "task_loss": 1.1563267707824707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4002039134502411, + "epoch": 8.19, + "learning_rate": 9.061707523245985e-06, + "loss": 0.3236, + "step": 9686, + "task_loss": 1.067454218864441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4502770006656647, + "epoch": 8.19, + "learning_rate": 9.057480980557904e-06, + "loss": 0.414, + "step": 9687, + "task_loss": 0.9178832173347473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1718170940876007, + "epoch": 8.19, + "learning_rate": 9.053254437869822e-06, + "loss": 0.4125, + "step": 9688, + "task_loss": 0.5020856857299805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18060216307640076, + "epoch": 8.19, + "learning_rate": 9.049027895181742e-06, + "loss": 0.4159, + "step": 9689, + "task_loss": 0.030078237876296043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27984631061553955, + "epoch": 8.19, + "learning_rate": 9.04480135249366e-06, + "loss": 0.4477, + "step": 9690, + "task_loss": 0.9567302465438843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27652040123939514, + "epoch": 8.19, + "learning_rate": 9.04057480980558e-06, + "loss": 0.3729, + "step": 9691, + "task_loss": 0.6109839081764221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5897128582000732, + "epoch": 8.19, + "learning_rate": 9.0363482671175e-06, + "loss": 0.549, + "step": 9692, + "task_loss": 0.9462595582008362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24445638060569763, + "epoch": 8.19, + "learning_rate": 9.032121724429416e-06, + "loss": 0.3746, + "step": 9693, + "task_loss": 0.09894224256277084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3792136311531067, + "epoch": 8.19, + "learning_rate": 9.027895181741336e-06, + "loss": 0.3604, + "step": 9694, + "task_loss": 0.9250402450561523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2336384654045105, + "epoch": 8.2, + "learning_rate": 9.023668639053255e-06, + "loss": 0.321, + "step": 9695, + "task_loss": 0.8424035906791687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5347297191619873, + "epoch": 8.2, + "learning_rate": 9.019442096365173e-06, + "loss": 0.4832, + "step": 9696, + "task_loss": 0.27140286564826965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2559162378311157, + "epoch": 8.2, + "learning_rate": 9.015215553677092e-06, + "loss": 0.3016, + "step": 9697, + "task_loss": 0.12297423928976059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1972401887178421, + "epoch": 8.2, + "learning_rate": 9.010989010989011e-06, + "loss": 0.3652, + "step": 9698, + "task_loss": 0.7072030901908875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2507309019565582, + "epoch": 8.2, + "learning_rate": 9.006762468300931e-06, + "loss": 0.3587, + "step": 9699, + "task_loss": 0.12716975808143616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35743248462677, + "epoch": 8.2, + "learning_rate": 9.002535925612849e-06, + "loss": 0.3417, + "step": 9700, + "task_loss": 0.4626259207725525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2359066754579544, + "epoch": 8.2, + "learning_rate": 8.998309382924767e-06, + "loss": 0.2987, + "step": 9701, + "task_loss": 0.16904082894325256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2418484091758728, + "epoch": 8.2, + "learning_rate": 8.994082840236687e-06, + "loss": 0.3256, + "step": 9702, + "task_loss": 0.4192400872707367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17623473703861237, + "epoch": 8.2, + "learning_rate": 8.989856297548607e-06, + "loss": 0.3374, + "step": 9703, + "task_loss": 0.8667232394218445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4268370270729065, + "epoch": 8.2, + "learning_rate": 8.985629754860525e-06, + "loss": 0.406, + "step": 9704, + "task_loss": 1.2267112731933594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4142894744873047, + "epoch": 8.2, + "learning_rate": 8.981403212172443e-06, + "loss": 0.4862, + "step": 9705, + "task_loss": 0.9634127616882324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4781948924064636, + "epoch": 8.2, + "learning_rate": 8.977176669484362e-06, + "loss": 0.5278, + "step": 9706, + "task_loss": 0.8409520387649536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31174859404563904, + "epoch": 8.21, + "learning_rate": 8.97295012679628e-06, + "loss": 0.2603, + "step": 9707, + "task_loss": 0.25963011384010315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5368709564208984, + "epoch": 8.21, + "learning_rate": 8.9687235841082e-06, + "loss": 0.4164, + "step": 9708, + "task_loss": 1.1820242404937744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35164427757263184, + "epoch": 8.21, + "learning_rate": 8.964497041420118e-06, + "loss": 0.4737, + "step": 9709, + "task_loss": 0.6053301692008972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22200173139572144, + "epoch": 8.21, + "learning_rate": 8.960270498732038e-06, + "loss": 0.2778, + "step": 9710, + "task_loss": 0.1624266505241394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2969588339328766, + "epoch": 8.21, + "learning_rate": 8.956043956043956e-06, + "loss": 0.3573, + "step": 9711, + "task_loss": 0.5570029616355896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4154408574104309, + "epoch": 8.21, + "learning_rate": 8.951817413355876e-06, + "loss": 0.4129, + "step": 9712, + "task_loss": 0.13733650743961334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4138985574245453, + "epoch": 8.21, + "learning_rate": 8.947590870667794e-06, + "loss": 0.4428, + "step": 9713, + "task_loss": 0.4884899854660034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3642539381980896, + "epoch": 8.21, + "learning_rate": 8.943364327979714e-06, + "loss": 0.3472, + "step": 9714, + "task_loss": 0.6372573375701904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2805880606174469, + "epoch": 8.21, + "learning_rate": 8.939137785291632e-06, + "loss": 0.379, + "step": 9715, + "task_loss": 0.6021783947944641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20688527822494507, + "epoch": 8.21, + "learning_rate": 8.934911242603551e-06, + "loss": 0.3272, + "step": 9716, + "task_loss": 0.31165313720703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30859261751174927, + "epoch": 8.21, + "learning_rate": 8.93068469991547e-06, + "loss": 0.3735, + "step": 9717, + "task_loss": 0.10539247840642929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4432902932167053, + "epoch": 8.21, + "learning_rate": 8.926458157227387e-06, + "loss": 0.3591, + "step": 9718, + "task_loss": 0.9530718922615051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.678508460521698, + "epoch": 8.22, + "learning_rate": 8.922231614539307e-06, + "loss": 0.462, + "step": 9719, + "task_loss": 0.8696460723876953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8330650925636292, + "epoch": 8.22, + "learning_rate": 8.918005071851227e-06, + "loss": 0.495, + "step": 9720, + "task_loss": 0.858477771282196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31208711862564087, + "epoch": 8.22, + "learning_rate": 8.913778529163145e-06, + "loss": 0.3016, + "step": 9721, + "task_loss": 1.2879819869995117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40455615520477295, + "epoch": 8.22, + "learning_rate": 8.909551986475063e-06, + "loss": 0.4065, + "step": 9722, + "task_loss": 0.3742080330848694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5005300641059875, + "epoch": 8.22, + "learning_rate": 8.905325443786983e-06, + "loss": 0.3483, + "step": 9723, + "task_loss": 0.8587540984153748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20778794586658478, + "epoch": 8.22, + "learning_rate": 8.901098901098902e-06, + "loss": 0.3045, + "step": 9724, + "task_loss": 0.11371275782585144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23917904496192932, + "epoch": 8.22, + "learning_rate": 8.89687235841082e-06, + "loss": 0.3445, + "step": 9725, + "task_loss": 0.12882114946842194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33064234256744385, + "epoch": 8.22, + "learning_rate": 8.892645815722739e-06, + "loss": 0.3554, + "step": 9726, + "task_loss": 0.4286997616291046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.538514256477356, + "epoch": 8.22, + "learning_rate": 8.888419273034658e-06, + "loss": 0.4464, + "step": 9727, + "task_loss": 0.7297476530075073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2555297017097473, + "epoch": 8.22, + "learning_rate": 8.884192730346576e-06, + "loss": 0.3279, + "step": 9728, + "task_loss": 0.6955799460411072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3818511366844177, + "epoch": 8.22, + "learning_rate": 8.879966187658496e-06, + "loss": 0.4628, + "step": 9729, + "task_loss": 1.0754139423370361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24143879115581512, + "epoch": 8.22, + "learning_rate": 8.875739644970414e-06, + "loss": 0.2392, + "step": 9730, + "task_loss": 0.2514159679412842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19504114985466003, + "epoch": 8.23, + "learning_rate": 8.871513102282334e-06, + "loss": 0.2869, + "step": 9731, + "task_loss": 0.8008387684822083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22287370264530182, + "epoch": 8.23, + "learning_rate": 8.867286559594252e-06, + "loss": 0.3264, + "step": 9732, + "task_loss": 0.4593740403652191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3432191014289856, + "epoch": 8.23, + "learning_rate": 8.863060016906172e-06, + "loss": 0.4326, + "step": 9733, + "task_loss": 0.6860237717628479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4067971110343933, + "epoch": 8.23, + "learning_rate": 8.85883347421809e-06, + "loss": 0.322, + "step": 9734, + "task_loss": 0.3725316822528839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.260647714138031, + "epoch": 8.23, + "learning_rate": 8.85460693153001e-06, + "loss": 0.3818, + "step": 9735, + "task_loss": 0.2765454947948456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4696386456489563, + "epoch": 8.23, + "learning_rate": 8.850380388841927e-06, + "loss": 0.5069, + "step": 9736, + "task_loss": 0.48660120368003845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4835379719734192, + "epoch": 8.23, + "learning_rate": 8.846153846153847e-06, + "loss": 0.4994, + "step": 9737, + "task_loss": 0.6464107036590576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21387115120887756, + "epoch": 8.23, + "learning_rate": 8.841927303465765e-06, + "loss": 0.319, + "step": 9738, + "task_loss": 0.34822961688041687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3433699905872345, + "epoch": 8.23, + "learning_rate": 8.837700760777683e-06, + "loss": 0.3185, + "step": 9739, + "task_loss": 0.9515796303749084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5184146761894226, + "epoch": 8.23, + "learning_rate": 8.833474218089603e-06, + "loss": 0.5044, + "step": 9740, + "task_loss": 1.6453769207000732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4171653389930725, + "epoch": 8.23, + "learning_rate": 8.829247675401523e-06, + "loss": 0.3496, + "step": 9741, + "task_loss": 0.282680869102478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32187342643737793, + "epoch": 8.23, + "learning_rate": 8.82502113271344e-06, + "loss": 0.3367, + "step": 9742, + "task_loss": 0.8890480399131775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3497220575809479, + "epoch": 8.24, + "learning_rate": 8.820794590025359e-06, + "loss": 0.3651, + "step": 9743, + "task_loss": 0.7869232892990112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42103883624076843, + "epoch": 8.24, + "learning_rate": 8.816568047337279e-06, + "loss": 0.3192, + "step": 9744, + "task_loss": 0.5570975542068481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5558339953422546, + "epoch": 8.24, + "learning_rate": 8.812341504649198e-06, + "loss": 0.3757, + "step": 9745, + "task_loss": 1.01534903049469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3667979836463928, + "epoch": 8.24, + "learning_rate": 8.808114961961116e-06, + "loss": 0.4696, + "step": 9746, + "task_loss": 0.8238222002983093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3211827874183655, + "epoch": 8.24, + "learning_rate": 8.803888419273034e-06, + "loss": 0.3532, + "step": 9747, + "task_loss": 0.29792577028274536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2905444800853729, + "epoch": 8.24, + "learning_rate": 8.799661876584954e-06, + "loss": 0.3396, + "step": 9748, + "task_loss": 0.5318219661712646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26383304595947266, + "epoch": 8.24, + "learning_rate": 8.795435333896874e-06, + "loss": 0.3369, + "step": 9749, + "task_loss": 0.9646943807601929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2157078981399536, + "epoch": 8.24, + "learning_rate": 8.791208791208792e-06, + "loss": 0.2285, + "step": 9750, + "task_loss": 0.3005932867527008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4590211510658264, + "epoch": 8.24, + "learning_rate": 8.78698224852071e-06, + "loss": 0.4223, + "step": 9751, + "task_loss": 0.9414132237434387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2220081239938736, + "epoch": 8.24, + "learning_rate": 8.78275570583263e-06, + "loss": 0.3047, + "step": 9752, + "task_loss": 0.19204185903072357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3985971510410309, + "epoch": 8.24, + "learning_rate": 8.778529163144548e-06, + "loss": 0.3543, + "step": 9753, + "task_loss": 0.2390933483839035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30922242999076843, + "epoch": 8.24, + "learning_rate": 8.774302620456468e-06, + "loss": 0.3615, + "step": 9754, + "task_loss": 0.23663471639156342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4071299433708191, + "epoch": 8.25, + "learning_rate": 8.770076077768386e-06, + "loss": 0.4329, + "step": 9755, + "task_loss": 0.7447525858879089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41912806034088135, + "epoch": 8.25, + "learning_rate": 8.765849535080305e-06, + "loss": 0.3616, + "step": 9756, + "task_loss": 0.19413790106773376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34253525733947754, + "epoch": 8.25, + "learning_rate": 8.761622992392223e-06, + "loss": 0.404, + "step": 9757, + "task_loss": 0.7007615566253662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4242688715457916, + "epoch": 8.25, + "learning_rate": 8.757396449704143e-06, + "loss": 0.3737, + "step": 9758, + "task_loss": 0.6549898982048035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24502715468406677, + "epoch": 8.25, + "learning_rate": 8.753169907016061e-06, + "loss": 0.3152, + "step": 9759, + "task_loss": 0.6109122037887573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15375986695289612, + "epoch": 8.25, + "learning_rate": 8.74894336432798e-06, + "loss": 0.3072, + "step": 9760, + "task_loss": 0.29879170656204224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4089164137840271, + "epoch": 8.25, + "learning_rate": 8.744716821639899e-06, + "loss": 0.3149, + "step": 9761, + "task_loss": 0.6299025416374207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.386701762676239, + "epoch": 8.25, + "learning_rate": 8.740490278951819e-06, + "loss": 0.414, + "step": 9762, + "task_loss": 0.2943950891494751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19215984642505646, + "epoch": 8.25, + "learning_rate": 8.736263736263737e-06, + "loss": 0.3614, + "step": 9763, + "task_loss": 0.6087113618850708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28922000527381897, + "epoch": 8.25, + "learning_rate": 8.732037193575655e-06, + "loss": 0.3482, + "step": 9764, + "task_loss": 0.4135283827781677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40279993414878845, + "epoch": 8.25, + "learning_rate": 8.727810650887574e-06, + "loss": 0.3107, + "step": 9765, + "task_loss": 0.5130314230918884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3185824155807495, + "epoch": 8.26, + "learning_rate": 8.723584108199494e-06, + "loss": 0.4994, + "step": 9766, + "task_loss": 0.2831957936286926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2042769491672516, + "epoch": 8.26, + "learning_rate": 8.719357565511412e-06, + "loss": 0.3691, + "step": 9767, + "task_loss": 0.3684547245502472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27875420451164246, + "epoch": 8.26, + "learning_rate": 8.71513102282333e-06, + "loss": 0.379, + "step": 9768, + "task_loss": 0.7846376895904541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4328498840332031, + "epoch": 8.26, + "learning_rate": 8.71090448013525e-06, + "loss": 0.5883, + "step": 9769, + "task_loss": 0.4839381277561188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49871253967285156, + "epoch": 8.26, + "learning_rate": 8.70667793744717e-06, + "loss": 0.4333, + "step": 9770, + "task_loss": 0.3765486478805542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36395588517189026, + "epoch": 8.26, + "learning_rate": 8.702451394759086e-06, + "loss": 0.3618, + "step": 9771, + "task_loss": 0.21075135469436646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40310561656951904, + "epoch": 8.26, + "learning_rate": 8.698224852071006e-06, + "loss": 0.4033, + "step": 9772, + "task_loss": 0.6758021712303162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24684014916419983, + "epoch": 8.26, + "learning_rate": 8.693998309382926e-06, + "loss": 0.3499, + "step": 9773, + "task_loss": 0.10768115520477295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28706932067871094, + "epoch": 8.26, + "learning_rate": 8.689771766694844e-06, + "loss": 0.3335, + "step": 9774, + "task_loss": 0.1497696489095688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36292895674705505, + "epoch": 8.26, + "learning_rate": 8.685545224006762e-06, + "loss": 0.3654, + "step": 9775, + "task_loss": 0.5850411653518677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4508146643638611, + "epoch": 8.26, + "learning_rate": 8.681318681318681e-06, + "loss": 0.3083, + "step": 9776, + "task_loss": 1.4609848260879517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34915974736213684, + "epoch": 8.26, + "learning_rate": 8.677092138630601e-06, + "loss": 0.4138, + "step": 9777, + "task_loss": 0.7426593899726868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.615005373954773, + "epoch": 8.27, + "learning_rate": 8.67286559594252e-06, + "loss": 0.4669, + "step": 9778, + "task_loss": 0.573525071144104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17758822441101074, + "epoch": 8.27, + "learning_rate": 8.668639053254437e-06, + "loss": 0.3033, + "step": 9779, + "task_loss": 0.8016296625137329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16171646118164062, + "epoch": 8.27, + "learning_rate": 8.664412510566357e-06, + "loss": 0.3355, + "step": 9780, + "task_loss": 0.05476692318916321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44864290952682495, + "epoch": 8.27, + "learning_rate": 8.660185967878277e-06, + "loss": 0.4514, + "step": 9781, + "task_loss": 1.1707797050476074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2376815378665924, + "epoch": 8.27, + "learning_rate": 8.655959425190195e-06, + "loss": 0.3855, + "step": 9782, + "task_loss": 0.19468216598033905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3082500696182251, + "epoch": 8.27, + "learning_rate": 8.651732882502115e-06, + "loss": 0.3407, + "step": 9783, + "task_loss": 0.6495285034179688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6310072541236877, + "epoch": 8.27, + "learning_rate": 8.647506339814033e-06, + "loss": 0.4792, + "step": 9784, + "task_loss": 1.4811997413635254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33435362577438354, + "epoch": 8.27, + "learning_rate": 8.64327979712595e-06, + "loss": 0.4131, + "step": 9785, + "task_loss": 0.7153709530830383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33940380811691284, + "epoch": 8.27, + "learning_rate": 8.63905325443787e-06, + "loss": 0.3645, + "step": 9786, + "task_loss": 1.0010336637496948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35735416412353516, + "epoch": 8.27, + "learning_rate": 8.63482671174979e-06, + "loss": 0.3711, + "step": 9787, + "task_loss": 0.38904234766960144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3098819851875305, + "epoch": 8.27, + "learning_rate": 8.630600169061708e-06, + "loss": 0.329, + "step": 9788, + "task_loss": 0.8288509845733643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36749595403671265, + "epoch": 8.27, + "learning_rate": 8.626373626373626e-06, + "loss": 0.3569, + "step": 9789, + "task_loss": 0.582553505897522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45635986328125, + "epoch": 8.28, + "learning_rate": 8.622147083685546e-06, + "loss": 0.2987, + "step": 9790, + "task_loss": 0.30947017669677734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2071198672056198, + "epoch": 8.28, + "learning_rate": 8.617920540997466e-06, + "loss": 0.3581, + "step": 9791, + "task_loss": 0.6297392845153809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29579681158065796, + "epoch": 8.28, + "learning_rate": 8.613693998309382e-06, + "loss": 0.2889, + "step": 9792, + "task_loss": 1.2318494319915771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4114759862422943, + "epoch": 8.28, + "learning_rate": 8.609467455621302e-06, + "loss": 0.36, + "step": 9793, + "task_loss": 0.5111846923828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44780099391937256, + "epoch": 8.28, + "learning_rate": 8.605240912933221e-06, + "loss": 0.3257, + "step": 9794, + "task_loss": 0.6333122849464417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3183003067970276, + "epoch": 8.28, + "learning_rate": 8.601014370245141e-06, + "loss": 0.272, + "step": 9795, + "task_loss": 0.6065946221351624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6599653363227844, + "epoch": 8.28, + "learning_rate": 8.596787827557058e-06, + "loss": 0.4934, + "step": 9796, + "task_loss": 0.48435166478157043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41480788588523865, + "epoch": 8.28, + "learning_rate": 8.592561284868977e-06, + "loss": 0.4061, + "step": 9797, + "task_loss": 1.2546712160110474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3400757312774658, + "epoch": 8.28, + "learning_rate": 8.588334742180897e-06, + "loss": 0.2292, + "step": 9798, + "task_loss": 0.17973552644252777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28583288192749023, + "epoch": 8.28, + "learning_rate": 8.584108199492815e-06, + "loss": 0.3209, + "step": 9799, + "task_loss": 0.3710036277770996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.9102689027786255, + "epoch": 8.28, + "learning_rate": 8.579881656804733e-06, + "loss": 0.4699, + "step": 9800, + "task_loss": 0.6184324622154236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3006907105445862, + "epoch": 8.28, + "learning_rate": 8.575655114116653e-06, + "loss": 0.309, + "step": 9801, + "task_loss": 0.6547111868858337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25536441802978516, + "epoch": 8.29, + "learning_rate": 8.571428571428573e-06, + "loss": 0.3756, + "step": 9802, + "task_loss": 0.476369172334671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19718025624752045, + "epoch": 8.29, + "learning_rate": 8.56720202874049e-06, + "loss": 0.4194, + "step": 9803, + "task_loss": 0.5830941796302795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24336951971054077, + "epoch": 8.29, + "learning_rate": 8.562975486052409e-06, + "loss": 0.2964, + "step": 9804, + "task_loss": 0.26734843850135803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27275145053863525, + "epoch": 8.29, + "learning_rate": 8.558748943364328e-06, + "loss": 0.394, + "step": 9805, + "task_loss": 0.3465957045555115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25579655170440674, + "epoch": 8.29, + "learning_rate": 8.554522400676246e-06, + "loss": 0.2906, + "step": 9806, + "task_loss": 0.732319712638855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.624285101890564, + "epoch": 8.29, + "learning_rate": 8.550295857988166e-06, + "loss": 0.4672, + "step": 9807, + "task_loss": 0.6681028008460999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15036822855472565, + "epoch": 8.29, + "learning_rate": 8.546069315300084e-06, + "loss": 0.2846, + "step": 9808, + "task_loss": 0.31999534368515015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4442770779132843, + "epoch": 8.29, + "learning_rate": 8.541842772612004e-06, + "loss": 0.4437, + "step": 9809, + "task_loss": 0.9391199350357056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2138526886701584, + "epoch": 8.29, + "learning_rate": 8.537616229923922e-06, + "loss": 0.2339, + "step": 9810, + "task_loss": 0.472034215927124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36865174770355225, + "epoch": 8.29, + "learning_rate": 8.533389687235842e-06, + "loss": 0.2825, + "step": 9811, + "task_loss": 0.5916270017623901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18036213517189026, + "epoch": 8.29, + "learning_rate": 8.529163144547762e-06, + "loss": 0.2932, + "step": 9812, + "task_loss": 0.1685146987438202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2766318917274475, + "epoch": 8.29, + "learning_rate": 8.52493660185968e-06, + "loss": 0.3511, + "step": 9813, + "task_loss": 0.1905214786529541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.273795485496521, + "epoch": 8.3, + "learning_rate": 8.520710059171598e-06, + "loss": 0.4107, + "step": 9814, + "task_loss": 0.695587158203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18488679826259613, + "epoch": 8.3, + "learning_rate": 8.516483516483517e-06, + "loss": 0.2454, + "step": 9815, + "task_loss": 0.07990279048681259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46626415848731995, + "epoch": 8.3, + "learning_rate": 8.512256973795437e-06, + "loss": 0.3592, + "step": 9816, + "task_loss": 0.6682854890823364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.220209002494812, + "epoch": 8.3, + "learning_rate": 8.508030431107353e-06, + "loss": 0.4499, + "step": 9817, + "task_loss": 0.30251544713974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18685534596443176, + "epoch": 8.3, + "learning_rate": 8.503803888419273e-06, + "loss": 0.284, + "step": 9818, + "task_loss": 0.5266330242156982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18420666456222534, + "epoch": 8.3, + "learning_rate": 8.499577345731193e-06, + "loss": 0.3046, + "step": 9819, + "task_loss": 0.4369259476661682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23351049423217773, + "epoch": 8.3, + "learning_rate": 8.495350803043111e-06, + "loss": 0.4464, + "step": 9820, + "task_loss": 0.788015604019165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40722036361694336, + "epoch": 8.3, + "learning_rate": 8.491124260355029e-06, + "loss": 0.3683, + "step": 9821, + "task_loss": 1.1177533864974976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5063949823379517, + "epoch": 8.3, + "learning_rate": 8.486897717666949e-06, + "loss": 0.416, + "step": 9822, + "task_loss": 1.0140175819396973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3399730622768402, + "epoch": 8.3, + "learning_rate": 8.482671174978868e-06, + "loss": 0.3062, + "step": 9823, + "task_loss": 1.3542838096618652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44571805000305176, + "epoch": 8.3, + "learning_rate": 8.478444632290787e-06, + "loss": 0.3215, + "step": 9824, + "task_loss": 1.4812110662460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16501504182815552, + "epoch": 8.3, + "learning_rate": 8.474218089602705e-06, + "loss": 0.3927, + "step": 9825, + "task_loss": 0.07019282877445221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6326087713241577, + "epoch": 8.31, + "learning_rate": 8.469991546914624e-06, + "loss": 0.4385, + "step": 9826, + "task_loss": 0.4715222120285034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34353554248809814, + "epoch": 8.31, + "learning_rate": 8.465765004226544e-06, + "loss": 0.2911, + "step": 9827, + "task_loss": 0.3344884216785431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45326435565948486, + "epoch": 8.31, + "learning_rate": 8.461538461538462e-06, + "loss": 0.4698, + "step": 9828, + "task_loss": 1.0761221647262573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15374648571014404, + "epoch": 8.31, + "learning_rate": 8.45731191885038e-06, + "loss": 0.2651, + "step": 9829, + "task_loss": 0.36588919162750244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4200078547000885, + "epoch": 8.31, + "learning_rate": 8.4530853761623e-06, + "loss": 0.3407, + "step": 9830, + "task_loss": 0.9219998717308044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30679208040237427, + "epoch": 8.31, + "learning_rate": 8.448858833474218e-06, + "loss": 0.4453, + "step": 9831, + "task_loss": 0.6855963468551636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5602876543998718, + "epoch": 8.31, + "learning_rate": 8.444632290786138e-06, + "loss": 0.3933, + "step": 9832, + "task_loss": 0.6793924570083618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3089636564254761, + "epoch": 8.31, + "learning_rate": 8.440405748098056e-06, + "loss": 0.2913, + "step": 9833, + "task_loss": 0.6300331354141235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.598201334476471, + "epoch": 8.31, + "learning_rate": 8.436179205409975e-06, + "loss": 0.4884, + "step": 9834, + "task_loss": 1.1318031549453735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42228835821151733, + "epoch": 8.31, + "learning_rate": 8.431952662721893e-06, + "loss": 0.3413, + "step": 9835, + "task_loss": 0.6755003929138184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7260577082633972, + "epoch": 8.31, + "learning_rate": 8.427726120033813e-06, + "loss": 0.5329, + "step": 9836, + "task_loss": 0.8321139216423035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4221124053001404, + "epoch": 8.32, + "learning_rate": 8.423499577345731e-06, + "loss": 0.4089, + "step": 9837, + "task_loss": 0.7170254588127136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25226616859436035, + "epoch": 8.32, + "learning_rate": 8.41927303465765e-06, + "loss": 0.3621, + "step": 9838, + "task_loss": 1.1626739501953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36467647552490234, + "epoch": 8.32, + "learning_rate": 8.415046491969569e-06, + "loss": 0.4305, + "step": 9839, + "task_loss": 0.4601993262767792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34807509183883667, + "epoch": 8.32, + "learning_rate": 8.410819949281489e-06, + "loss": 0.332, + "step": 9840, + "task_loss": 1.4866724014282227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3440382778644562, + "epoch": 8.32, + "learning_rate": 8.406593406593407e-06, + "loss": 0.2649, + "step": 9841, + "task_loss": 0.43716341257095337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17226433753967285, + "epoch": 8.32, + "learning_rate": 8.402366863905325e-06, + "loss": 0.383, + "step": 9842, + "task_loss": 0.30462706089019775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27353745698928833, + "epoch": 8.32, + "learning_rate": 8.398140321217245e-06, + "loss": 0.3829, + "step": 9843, + "task_loss": 1.0205519199371338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36124366521835327, + "epoch": 8.32, + "learning_rate": 8.393913778529164e-06, + "loss": 0.4015, + "step": 9844, + "task_loss": 0.7810443639755249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3139660358428955, + "epoch": 8.32, + "learning_rate": 8.389687235841082e-06, + "loss": 0.3712, + "step": 9845, + "task_loss": 0.5591665506362915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2383313775062561, + "epoch": 8.32, + "learning_rate": 8.385460693153e-06, + "loss": 0.343, + "step": 9846, + "task_loss": 0.4618859887123108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4592198133468628, + "epoch": 8.32, + "learning_rate": 8.38123415046492e-06, + "loss": 0.3406, + "step": 9847, + "task_loss": 0.802821695804596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.355598121881485, + "epoch": 8.32, + "learning_rate": 8.37700760777684e-06, + "loss": 0.2888, + "step": 9848, + "task_loss": 0.3750664293766022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4725492000579834, + "epoch": 8.33, + "learning_rate": 8.372781065088758e-06, + "loss": 0.3488, + "step": 9849, + "task_loss": 0.9732497334480286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48520582914352417, + "epoch": 8.33, + "learning_rate": 8.368554522400676e-06, + "loss": 0.3849, + "step": 9850, + "task_loss": 1.0766435861587524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19553996622562408, + "epoch": 8.33, + "learning_rate": 8.364327979712596e-06, + "loss": 0.2692, + "step": 9851, + "task_loss": 0.5403403043746948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26956072449684143, + "epoch": 8.33, + "learning_rate": 8.360101437024514e-06, + "loss": 0.3846, + "step": 9852, + "task_loss": 0.29928338527679443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5034545063972473, + "epoch": 8.33, + "learning_rate": 8.355874894336434e-06, + "loss": 0.4389, + "step": 9853, + "task_loss": 0.850502073764801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.655781090259552, + "epoch": 8.33, + "learning_rate": 8.351648351648352e-06, + "loss": 0.3631, + "step": 9854, + "task_loss": 0.6750800013542175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27420270442962646, + "epoch": 8.33, + "learning_rate": 8.347421808960271e-06, + "loss": 0.3808, + "step": 9855, + "task_loss": 0.6062148213386536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29507923126220703, + "epoch": 8.33, + "learning_rate": 8.34319526627219e-06, + "loss": 0.2839, + "step": 9856, + "task_loss": 0.12409472465515137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39879393577575684, + "epoch": 8.33, + "learning_rate": 8.338968723584109e-06, + "loss": 0.4118, + "step": 9857, + "task_loss": 0.5695974230766296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38156503438949585, + "epoch": 8.33, + "learning_rate": 8.334742180896027e-06, + "loss": 0.3743, + "step": 9858, + "task_loss": 0.6405352354049683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6437473297119141, + "epoch": 8.33, + "learning_rate": 8.330515638207947e-06, + "loss": 0.412, + "step": 9859, + "task_loss": 0.5421026349067688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1625746786594391, + "epoch": 8.33, + "learning_rate": 8.326289095519865e-06, + "loss": 0.4397, + "step": 9860, + "task_loss": 0.24758104979991913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33773335814476013, + "epoch": 8.34, + "learning_rate": 8.322062552831785e-06, + "loss": 0.452, + "step": 9861, + "task_loss": 0.028131060302257538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.404288649559021, + "epoch": 8.34, + "learning_rate": 8.317836010143703e-06, + "loss": 0.5131, + "step": 9862, + "task_loss": 0.759227454662323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5208569765090942, + "epoch": 8.34, + "learning_rate": 8.31360946745562e-06, + "loss": 0.399, + "step": 9863, + "task_loss": 0.7569478154182434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31458181142807007, + "epoch": 8.34, + "learning_rate": 8.30938292476754e-06, + "loss": 0.4308, + "step": 9864, + "task_loss": 0.4518139362335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5279165506362915, + "epoch": 8.34, + "learning_rate": 8.30515638207946e-06, + "loss": 0.4692, + "step": 9865, + "task_loss": 0.3020053505897522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27899041771888733, + "epoch": 8.34, + "learning_rate": 8.300929839391378e-06, + "loss": 0.5797, + "step": 9866, + "task_loss": 0.6417058110237122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23574316501617432, + "epoch": 8.34, + "learning_rate": 8.296703296703296e-06, + "loss": 0.369, + "step": 9867, + "task_loss": 0.2814878523349762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3385157883167267, + "epoch": 8.34, + "learning_rate": 8.292476754015216e-06, + "loss": 0.3535, + "step": 9868, + "task_loss": 0.19284862279891968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.337044358253479, + "epoch": 8.34, + "learning_rate": 8.288250211327136e-06, + "loss": 0.3136, + "step": 9869, + "task_loss": 0.25778377056121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38773009181022644, + "epoch": 8.34, + "learning_rate": 8.284023668639054e-06, + "loss": 0.3432, + "step": 9870, + "task_loss": 0.5943514108657837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4057808518409729, + "epoch": 8.34, + "learning_rate": 8.279797125950972e-06, + "loss": 0.3282, + "step": 9871, + "task_loss": 0.13423751294612885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47784048318862915, + "epoch": 8.34, + "learning_rate": 8.275570583262892e-06, + "loss": 0.3641, + "step": 9872, + "task_loss": 0.3588549792766571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3793264627456665, + "epoch": 8.35, + "learning_rate": 8.27134404057481e-06, + "loss": 0.5567, + "step": 9873, + "task_loss": 0.07439304143190384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4812926948070526, + "epoch": 8.35, + "learning_rate": 8.26711749788673e-06, + "loss": 0.4656, + "step": 9874, + "task_loss": 1.0317496061325073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3727072477340698, + "epoch": 8.35, + "learning_rate": 8.262890955198647e-06, + "loss": 0.4308, + "step": 9875, + "task_loss": 1.0501117706298828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29243168234825134, + "epoch": 8.35, + "learning_rate": 8.258664412510567e-06, + "loss": 0.311, + "step": 9876, + "task_loss": 0.05869528278708458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29091677069664, + "epoch": 8.35, + "learning_rate": 8.254437869822485e-06, + "loss": 0.5337, + "step": 9877, + "task_loss": 0.3071219027042389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37691131234169006, + "epoch": 8.35, + "learning_rate": 8.250211327134405e-06, + "loss": 0.4297, + "step": 9878, + "task_loss": 0.5585564374923706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5153303146362305, + "epoch": 8.35, + "learning_rate": 8.245984784446323e-06, + "loss": 0.3111, + "step": 9879, + "task_loss": 0.35851961374282837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29009193181991577, + "epoch": 8.35, + "learning_rate": 8.241758241758243e-06, + "loss": 0.3515, + "step": 9880, + "task_loss": 0.05839831382036209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4133725166320801, + "epoch": 8.35, + "learning_rate": 8.23753169907016e-06, + "loss": 0.3446, + "step": 9881, + "task_loss": 0.6248162984848022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4489487409591675, + "epoch": 8.35, + "learning_rate": 8.23330515638208e-06, + "loss": 0.3599, + "step": 9882, + "task_loss": 0.5395919680595398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49348533153533936, + "epoch": 8.35, + "learning_rate": 8.229078613693999e-06, + "loss": 0.4575, + "step": 9883, + "task_loss": 0.8853915929794312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3440357744693756, + "epoch": 8.35, + "learning_rate": 8.224852071005917e-06, + "loss": 0.4448, + "step": 9884, + "task_loss": 0.2521446645259857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5212501883506775, + "epoch": 8.36, + "learning_rate": 8.220625528317836e-06, + "loss": 0.3489, + "step": 9885, + "task_loss": 0.8031014204025269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40825706720352173, + "epoch": 8.36, + "learning_rate": 8.216398985629756e-06, + "loss": 0.3054, + "step": 9886, + "task_loss": 0.4760590195655823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30783775448799133, + "epoch": 8.36, + "learning_rate": 8.212172442941674e-06, + "loss": 0.3707, + "step": 9887, + "task_loss": 1.3281574249267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4840999245643616, + "epoch": 8.36, + "learning_rate": 8.207945900253592e-06, + "loss": 0.4185, + "step": 9888, + "task_loss": 0.356241375207901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40270790457725525, + "epoch": 8.36, + "learning_rate": 8.203719357565512e-06, + "loss": 0.3681, + "step": 9889, + "task_loss": 0.49908339977264404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1393299549818039, + "epoch": 8.36, + "learning_rate": 8.199492814877432e-06, + "loss": 0.397, + "step": 9890, + "task_loss": 0.5084965825080872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33766070008277893, + "epoch": 8.36, + "learning_rate": 8.19526627218935e-06, + "loss": 0.3042, + "step": 9891, + "task_loss": 0.6804378628730774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2858445942401886, + "epoch": 8.36, + "learning_rate": 8.191039729501268e-06, + "loss": 0.2528, + "step": 9892, + "task_loss": 0.2925701141357422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28024572134017944, + "epoch": 8.36, + "learning_rate": 8.186813186813188e-06, + "loss": 0.4699, + "step": 9893, + "task_loss": 0.3370833694934845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39834460616111755, + "epoch": 8.36, + "learning_rate": 8.182586644125107e-06, + "loss": 0.3797, + "step": 9894, + "task_loss": 0.37989887595176697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2483496069908142, + "epoch": 8.36, + "learning_rate": 8.178360101437024e-06, + "loss": 0.3141, + "step": 9895, + "task_loss": 0.4723920226097107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23568809032440186, + "epoch": 8.36, + "learning_rate": 8.174133558748943e-06, + "loss": 0.289, + "step": 9896, + "task_loss": 0.501482367515564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2683965265750885, + "epoch": 8.37, + "learning_rate": 8.169907016060863e-06, + "loss": 0.3067, + "step": 9897, + "task_loss": 0.10031570494174957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19225439429283142, + "epoch": 8.37, + "learning_rate": 8.165680473372781e-06, + "loss": 0.3742, + "step": 9898, + "task_loss": 0.2847628891468048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47012925148010254, + "epoch": 8.37, + "learning_rate": 8.161453930684701e-06, + "loss": 0.4484, + "step": 9899, + "task_loss": 0.2704078257083893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2899574041366577, + "epoch": 8.37, + "learning_rate": 8.157227387996619e-06, + "loss": 0.4418, + "step": 9900, + "task_loss": 0.8580310940742493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4211597442626953, + "epoch": 8.37, + "learning_rate": 8.153000845308539e-06, + "loss": 0.3541, + "step": 9901, + "task_loss": 1.182708501815796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6386754512786865, + "epoch": 8.37, + "learning_rate": 8.148774302620457e-06, + "loss": 0.5302, + "step": 9902, + "task_loss": 0.18620972335338593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26958057284355164, + "epoch": 8.37, + "learning_rate": 8.144547759932376e-06, + "loss": 0.4203, + "step": 9903, + "task_loss": 0.14039455354213715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44917744398117065, + "epoch": 8.37, + "learning_rate": 8.140321217244294e-06, + "loss": 0.468, + "step": 9904, + "task_loss": 0.6625657677650452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23699799180030823, + "epoch": 8.37, + "learning_rate": 8.136094674556213e-06, + "loss": 0.2832, + "step": 9905, + "task_loss": 0.49559539556503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4702160358428955, + "epoch": 8.37, + "learning_rate": 8.131868131868132e-06, + "loss": 0.4044, + "step": 9906, + "task_loss": 0.49343767762184143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3203650414943695, + "epoch": 8.37, + "learning_rate": 8.127641589180052e-06, + "loss": 0.4482, + "step": 9907, + "task_loss": 0.1452435553073883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39907383918762207, + "epoch": 8.38, + "learning_rate": 8.12341504649197e-06, + "loss": 0.3453, + "step": 9908, + "task_loss": 0.8481168746948242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17972984910011292, + "epoch": 8.38, + "learning_rate": 8.119188503803888e-06, + "loss": 0.3998, + "step": 9909, + "task_loss": 0.48957115411758423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18523091077804565, + "epoch": 8.38, + "learning_rate": 8.114961961115808e-06, + "loss": 0.4575, + "step": 9910, + "task_loss": 0.3024050295352936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3940434455871582, + "epoch": 8.38, + "learning_rate": 8.110735418427728e-06, + "loss": 0.4355, + "step": 9911, + "task_loss": 0.4449685215950012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47976329922676086, + "epoch": 8.38, + "learning_rate": 8.106508875739646e-06, + "loss": 0.4465, + "step": 9912, + "task_loss": 1.381178855895996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4957450330257416, + "epoch": 8.38, + "learning_rate": 8.102282333051564e-06, + "loss": 0.4842, + "step": 9913, + "task_loss": 0.246063232421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17261847853660583, + "epoch": 8.38, + "learning_rate": 8.098055790363483e-06, + "loss": 0.399, + "step": 9914, + "task_loss": 0.31979015469551086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22802722454071045, + "epoch": 8.38, + "learning_rate": 8.093829247675403e-06, + "loss": 0.3671, + "step": 9915, + "task_loss": 0.12243936955928802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.370728999376297, + "epoch": 8.38, + "learning_rate": 8.08960270498732e-06, + "loss": 0.3505, + "step": 9916, + "task_loss": 0.6238110661506653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17308086156845093, + "epoch": 8.38, + "learning_rate": 8.08537616229924e-06, + "loss": 0.3311, + "step": 9917, + "task_loss": 0.4537776708602905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35523146390914917, + "epoch": 8.38, + "learning_rate": 8.081149619611159e-06, + "loss": 0.5825, + "step": 9918, + "task_loss": 0.9502268433570862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6385256052017212, + "epoch": 8.38, + "learning_rate": 8.076923076923077e-06, + "loss": 0.4117, + "step": 9919, + "task_loss": 0.5215417742729187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4543847143650055, + "epoch": 8.39, + "learning_rate": 8.072696534234995e-06, + "loss": 0.4833, + "step": 9920, + "task_loss": 0.31832101941108704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18559032678604126, + "epoch": 8.39, + "learning_rate": 8.068469991546915e-06, + "loss": 0.3185, + "step": 9921, + "task_loss": 0.17780497670173645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2549199163913727, + "epoch": 8.39, + "learning_rate": 8.064243448858835e-06, + "loss": 0.4007, + "step": 9922, + "task_loss": 0.40377604961395264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22342194616794586, + "epoch": 8.39, + "learning_rate": 8.060016906170753e-06, + "loss": 0.262, + "step": 9923, + "task_loss": 0.1295715868473053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20527736842632294, + "epoch": 8.39, + "learning_rate": 8.05579036348267e-06, + "loss": 0.3797, + "step": 9924, + "task_loss": 0.5065737366676331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.637683093547821, + "epoch": 8.39, + "learning_rate": 8.05156382079459e-06, + "loss": 0.3786, + "step": 9925, + "task_loss": 0.7894318103790283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3687111735343933, + "epoch": 8.39, + "learning_rate": 8.04733727810651e-06, + "loss": 0.4023, + "step": 9926, + "task_loss": 0.5464559197425842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46690598130226135, + "epoch": 8.39, + "learning_rate": 8.043110735418428e-06, + "loss": 0.5103, + "step": 9927, + "task_loss": 0.630386471748352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32390865683555603, + "epoch": 8.39, + "learning_rate": 8.038884192730348e-06, + "loss": 0.4016, + "step": 9928, + "task_loss": 1.035956859588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3263307213783264, + "epoch": 8.39, + "learning_rate": 8.034657650042266e-06, + "loss": 0.4079, + "step": 9929, + "task_loss": 0.5266216397285461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3923707902431488, + "epoch": 8.39, + "learning_rate": 8.030431107354184e-06, + "loss": 0.4381, + "step": 9930, + "task_loss": 0.757575511932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3148203194141388, + "epoch": 8.39, + "learning_rate": 8.026204564666104e-06, + "loss": 0.3, + "step": 9931, + "task_loss": 0.3477681279182434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22264418005943298, + "epoch": 8.4, + "learning_rate": 8.021978021978023e-06, + "loss": 0.3898, + "step": 9932, + "task_loss": 0.5065764784812927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24075189232826233, + "epoch": 8.4, + "learning_rate": 8.017751479289941e-06, + "loss": 0.3037, + "step": 9933, + "task_loss": 0.5321162939071655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20382162928581238, + "epoch": 8.4, + "learning_rate": 8.01352493660186e-06, + "loss": 0.3014, + "step": 9934, + "task_loss": 0.3911793529987335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17122229933738708, + "epoch": 8.4, + "learning_rate": 8.00929839391378e-06, + "loss": 0.2796, + "step": 9935, + "task_loss": 1.2716217041015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25934481620788574, + "epoch": 8.4, + "learning_rate": 8.005071851225699e-06, + "loss": 0.3991, + "step": 9936, + "task_loss": 0.1272660493850708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2740709185600281, + "epoch": 8.4, + "learning_rate": 8.000845308537615e-06, + "loss": 0.2342, + "step": 9937, + "task_loss": 0.39149194955825806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.255855917930603, + "epoch": 8.4, + "learning_rate": 7.996618765849535e-06, + "loss": 0.3114, + "step": 9938, + "task_loss": 0.4888307452201843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4605828523635864, + "epoch": 8.4, + "learning_rate": 7.992392223161455e-06, + "loss": 0.3685, + "step": 9939, + "task_loss": 0.39939507842063904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3568412661552429, + "epoch": 8.4, + "learning_rate": 7.988165680473373e-06, + "loss": 0.2936, + "step": 9940, + "task_loss": 0.23691995441913605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40177398920059204, + "epoch": 8.4, + "learning_rate": 7.983939137785291e-06, + "loss": 0.4462, + "step": 9941, + "task_loss": 0.7156034111976624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2976890206336975, + "epoch": 8.4, + "learning_rate": 7.97971259509721e-06, + "loss": 0.3531, + "step": 9942, + "task_loss": 0.7515323758125305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49165937304496765, + "epoch": 8.4, + "learning_rate": 7.97548605240913e-06, + "loss": 0.3915, + "step": 9943, + "task_loss": 0.3210039734840393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5673562288284302, + "epoch": 8.41, + "learning_rate": 7.971259509721048e-06, + "loss": 0.4346, + "step": 9944, + "task_loss": 1.0063607692718506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26934075355529785, + "epoch": 8.41, + "learning_rate": 7.967032967032966e-06, + "loss": 0.2816, + "step": 9945, + "task_loss": 0.17124764621257782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20996598899364471, + "epoch": 8.41, + "learning_rate": 7.962806424344886e-06, + "loss": 0.3992, + "step": 9946, + "task_loss": 0.28219518065452576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4716126620769501, + "epoch": 8.41, + "learning_rate": 7.958579881656806e-06, + "loss": 0.4462, + "step": 9947, + "task_loss": 0.7046942114830017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2334522157907486, + "epoch": 8.41, + "learning_rate": 7.954353338968724e-06, + "loss": 0.3614, + "step": 9948, + "task_loss": 0.060270264744758606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2787330448627472, + "epoch": 8.41, + "learning_rate": 7.950126796280642e-06, + "loss": 0.3452, + "step": 9949, + "task_loss": 1.1330339908599854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20659777522087097, + "epoch": 8.41, + "learning_rate": 7.945900253592562e-06, + "loss": 0.356, + "step": 9950, + "task_loss": 0.28001347184181213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44068729877471924, + "epoch": 8.41, + "learning_rate": 7.94167371090448e-06, + "loss": 0.3424, + "step": 9951, + "task_loss": 0.7366507053375244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35117146372795105, + "epoch": 8.41, + "learning_rate": 7.9374471682164e-06, + "loss": 0.3334, + "step": 9952, + "task_loss": 0.7413483262062073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3230150640010834, + "epoch": 8.41, + "learning_rate": 7.933220625528318e-06, + "loss": 0.2723, + "step": 9953, + "task_loss": 0.3673068881034851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6200671792030334, + "epoch": 8.41, + "learning_rate": 7.928994082840237e-06, + "loss": 0.5178, + "step": 9954, + "task_loss": 0.9971244931221008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24222692847251892, + "epoch": 8.41, + "learning_rate": 7.924767540152155e-06, + "loss": 0.2965, + "step": 9955, + "task_loss": 0.6877383589744568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35239019989967346, + "epoch": 8.42, + "learning_rate": 7.920540997464075e-06, + "loss": 0.3727, + "step": 9956, + "task_loss": 0.45825764536857605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4563480615615845, + "epoch": 8.42, + "learning_rate": 7.916314454775993e-06, + "loss": 0.3432, + "step": 9957, + "task_loss": 0.7692358493804932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20112793147563934, + "epoch": 8.42, + "learning_rate": 7.912087912087913e-06, + "loss": 0.4285, + "step": 9958, + "task_loss": 0.7713651657104492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4392203688621521, + "epoch": 8.42, + "learning_rate": 7.907861369399831e-06, + "loss": 0.2882, + "step": 9959, + "task_loss": 0.7245163321495056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22501957416534424, + "epoch": 8.42, + "learning_rate": 7.90363482671175e-06, + "loss": 0.3645, + "step": 9960, + "task_loss": 0.03207080438733101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24949821829795837, + "epoch": 8.42, + "learning_rate": 7.89940828402367e-06, + "loss": 0.3796, + "step": 9961, + "task_loss": 0.1074836477637291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3178539574146271, + "epoch": 8.42, + "learning_rate": 7.895181741335587e-06, + "loss": 0.3421, + "step": 9962, + "task_loss": 0.363930344581604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45336586236953735, + "epoch": 8.42, + "learning_rate": 7.890955198647507e-06, + "loss": 0.4061, + "step": 9963, + "task_loss": 1.1299898624420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5575181841850281, + "epoch": 8.42, + "learning_rate": 7.886728655959426e-06, + "loss": 0.4031, + "step": 9964, + "task_loss": 0.864539384841919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21565748751163483, + "epoch": 8.42, + "learning_rate": 7.882502113271344e-06, + "loss": 0.3359, + "step": 9965, + "task_loss": 0.5902684926986694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23953759670257568, + "epoch": 8.42, + "learning_rate": 7.878275570583262e-06, + "loss": 0.3289, + "step": 9966, + "task_loss": 0.4212299585342407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4753879904747009, + "epoch": 8.42, + "learning_rate": 7.874049027895182e-06, + "loss": 0.4192, + "step": 9967, + "task_loss": 0.6777235865592957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3934580683708191, + "epoch": 8.43, + "learning_rate": 7.869822485207102e-06, + "loss": 0.4313, + "step": 9968, + "task_loss": 1.122774362564087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4727416932582855, + "epoch": 8.43, + "learning_rate": 7.86559594251902e-06, + "loss": 0.3283, + "step": 9969, + "task_loss": 0.6974933743476868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3496593236923218, + "epoch": 8.43, + "learning_rate": 7.861369399830938e-06, + "loss": 0.3877, + "step": 9970, + "task_loss": 0.6283162236213684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36651259660720825, + "epoch": 8.43, + "learning_rate": 7.857142857142858e-06, + "loss": 0.3055, + "step": 9971, + "task_loss": 0.4463374614715576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27455392479896545, + "epoch": 8.43, + "learning_rate": 7.852916314454776e-06, + "loss": 0.4034, + "step": 9972, + "task_loss": 0.4421502947807312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4017808735370636, + "epoch": 8.43, + "learning_rate": 7.848689771766695e-06, + "loss": 0.4118, + "step": 9973, + "task_loss": 1.015561580657959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18567708134651184, + "epoch": 8.43, + "learning_rate": 7.844463229078613e-06, + "loss": 0.3941, + "step": 9974, + "task_loss": 0.30658280849456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22371090948581696, + "epoch": 8.43, + "learning_rate": 7.840236686390533e-06, + "loss": 0.2463, + "step": 9975, + "task_loss": 0.5453804731369019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5094899535179138, + "epoch": 8.43, + "learning_rate": 7.836010143702451e-06, + "loss": 0.3741, + "step": 9976, + "task_loss": 0.7566254138946533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4013715386390686, + "epoch": 8.43, + "learning_rate": 7.831783601014371e-06, + "loss": 0.3908, + "step": 9977, + "task_loss": 1.00264310836792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33096060156822205, + "epoch": 8.43, + "learning_rate": 7.827557058326289e-06, + "loss": 0.3425, + "step": 9978, + "task_loss": 0.8430969715118408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4888285994529724, + "epoch": 8.44, + "learning_rate": 7.823330515638209e-06, + "loss": 0.4875, + "step": 9979, + "task_loss": 1.28627347946167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.535581111907959, + "epoch": 8.44, + "learning_rate": 7.819103972950127e-06, + "loss": 0.5124, + "step": 9980, + "task_loss": 0.49604836106300354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.255129337310791, + "epoch": 8.44, + "learning_rate": 7.814877430262047e-06, + "loss": 0.3107, + "step": 9981, + "task_loss": 0.24262180924415588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32798492908477783, + "epoch": 8.44, + "learning_rate": 7.810650887573965e-06, + "loss": 0.4304, + "step": 9982, + "task_loss": 0.7916689515113831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31929513812065125, + "epoch": 8.44, + "learning_rate": 7.806424344885883e-06, + "loss": 0.3273, + "step": 9983, + "task_loss": 0.6244804263114929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.286630243062973, + "epoch": 8.44, + "learning_rate": 7.802197802197802e-06, + "loss": 0.3666, + "step": 9984, + "task_loss": 0.3419410288333893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2809467315673828, + "epoch": 8.44, + "learning_rate": 7.797971259509722e-06, + "loss": 0.4896, + "step": 9985, + "task_loss": 0.09554605185985565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3604429364204407, + "epoch": 8.44, + "learning_rate": 7.79374471682164e-06, + "loss": 0.4541, + "step": 9986, + "task_loss": 0.3937378525733948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41663700342178345, + "epoch": 8.44, + "learning_rate": 7.789518174133558e-06, + "loss": 0.4006, + "step": 9987, + "task_loss": 1.0851434469223022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.291340708732605, + "epoch": 8.44, + "learning_rate": 7.785291631445478e-06, + "loss": 0.3726, + "step": 9988, + "task_loss": 1.104375958442688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5594011545181274, + "epoch": 8.44, + "learning_rate": 7.781065088757398e-06, + "loss": 0.3433, + "step": 9989, + "task_loss": 1.2137364149093628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.501724898815155, + "epoch": 8.44, + "learning_rate": 7.776838546069316e-06, + "loss": 0.3373, + "step": 9990, + "task_loss": 0.6115052700042725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.291567325592041, + "epoch": 8.45, + "learning_rate": 7.772612003381234e-06, + "loss": 0.3705, + "step": 9991, + "task_loss": 0.3286609649658203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24879682064056396, + "epoch": 8.45, + "learning_rate": 7.768385460693154e-06, + "loss": 0.3386, + "step": 9992, + "task_loss": 0.15706300735473633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23902809619903564, + "epoch": 8.45, + "learning_rate": 7.764158918005073e-06, + "loss": 0.4161, + "step": 9993, + "task_loss": 0.1779930591583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3532010316848755, + "epoch": 8.45, + "learning_rate": 7.759932375316991e-06, + "loss": 0.4264, + "step": 9994, + "task_loss": 0.18645226955413818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49448999762535095, + "epoch": 8.45, + "learning_rate": 7.75570583262891e-06, + "loss": 0.4334, + "step": 9995, + "task_loss": 1.6293988227844238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2907274663448334, + "epoch": 8.45, + "learning_rate": 7.751479289940829e-06, + "loss": 0.349, + "step": 9996, + "task_loss": 0.24322834610939026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37505269050598145, + "epoch": 8.45, + "learning_rate": 7.747252747252747e-06, + "loss": 0.411, + "step": 9997, + "task_loss": 0.5230420827865601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4134119153022766, + "epoch": 8.45, + "learning_rate": 7.743026204564667e-06, + "loss": 0.3025, + "step": 9998, + "task_loss": 0.8234663009643555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42424696683883667, + "epoch": 8.45, + "learning_rate": 7.738799661876585e-06, + "loss": 0.4191, + "step": 9999, + "task_loss": 1.0713717937469482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3065156936645508, + "epoch": 8.45, + "learning_rate": 7.734573119188505e-06, + "loss": 0.3651, + "step": 10000, + "task_loss": 0.16009142994880676 + }, + { + "epoch": 8.45, + "eval_accuracy": 0.9165148514851486, + "eval_loss": 0.2567279040813446, + "eval_runtime": 225.5684, + "eval_samples_per_second": 111.939, + "eval_steps_per_second": 0.878, + "step": 10000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15647852420806885, + "epoch": 8.45, + "learning_rate": 7.730346576500423e-06, + "loss": 0.2385, + "step": 10001, + "task_loss": 0.3650796711444855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5962855219841003, + "epoch": 8.45, + "learning_rate": 7.726120033812342e-06, + "loss": 0.4114, + "step": 10002, + "task_loss": 0.23950029909610748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35707521438598633, + "epoch": 8.46, + "learning_rate": 7.72189349112426e-06, + "loss": 0.3885, + "step": 10003, + "task_loss": 0.5751855969429016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26414790749549866, + "epoch": 8.46, + "learning_rate": 7.717666948436179e-06, + "loss": 0.2771, + "step": 10004, + "task_loss": 0.16733896732330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20213139057159424, + "epoch": 8.46, + "learning_rate": 7.713440405748098e-06, + "loss": 0.4707, + "step": 10005, + "task_loss": 0.26585355401039124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21714413166046143, + "epoch": 8.46, + "learning_rate": 7.709213863060018e-06, + "loss": 0.2975, + "step": 10006, + "task_loss": 0.45038193464279175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3212604224681854, + "epoch": 8.46, + "learning_rate": 7.704987320371936e-06, + "loss": 0.388, + "step": 10007, + "task_loss": 0.30243954062461853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30845028162002563, + "epoch": 8.46, + "learning_rate": 7.700760777683854e-06, + "loss": 0.4041, + "step": 10008, + "task_loss": 0.374865859746933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37851595878601074, + "epoch": 8.46, + "learning_rate": 7.696534234995774e-06, + "loss": 0.3195, + "step": 10009, + "task_loss": 0.3934018611907959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3625902235507965, + "epoch": 8.46, + "learning_rate": 7.692307692307694e-06, + "loss": 0.3909, + "step": 10010, + "task_loss": 0.5277790427207947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41369467973709106, + "epoch": 8.46, + "learning_rate": 7.688081149619612e-06, + "loss": 0.3433, + "step": 10011, + "task_loss": 0.5595895648002625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44314971566200256, + "epoch": 8.46, + "learning_rate": 7.68385460693153e-06, + "loss": 0.3874, + "step": 10012, + "task_loss": 0.7153849601745605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42098793387413025, + "epoch": 8.46, + "learning_rate": 7.67962806424345e-06, + "loss": 0.3927, + "step": 10013, + "task_loss": 1.7886065244674683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3648926615715027, + "epoch": 8.46, + "learning_rate": 7.67540152155537e-06, + "loss": 0.32, + "step": 10014, + "task_loss": 0.6187127232551575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24024340510368347, + "epoch": 8.47, + "learning_rate": 7.671174978867286e-06, + "loss": 0.433, + "step": 10015, + "task_loss": 0.5889041423797607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3975982666015625, + "epoch": 8.47, + "learning_rate": 7.666948436179205e-06, + "loss": 0.3824, + "step": 10016, + "task_loss": 0.09880679845809937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.454243928194046, + "epoch": 8.47, + "learning_rate": 7.662721893491125e-06, + "loss": 0.4064, + "step": 10017, + "task_loss": 1.050737977027893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2888142466545105, + "epoch": 8.47, + "learning_rate": 7.658495350803043e-06, + "loss": 0.3487, + "step": 10018, + "task_loss": 0.5387918949127197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23971499502658844, + "epoch": 8.47, + "learning_rate": 7.654268808114963e-06, + "loss": 0.3212, + "step": 10019, + "task_loss": 0.3706704080104828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2744256854057312, + "epoch": 8.47, + "learning_rate": 7.65004226542688e-06, + "loss": 0.3616, + "step": 10020, + "task_loss": 0.0930965319275856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3478437662124634, + "epoch": 8.47, + "learning_rate": 7.6458157227388e-06, + "loss": 0.3272, + "step": 10021, + "task_loss": 0.7598109841346741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19099076092243195, + "epoch": 8.47, + "learning_rate": 7.641589180050719e-06, + "loss": 0.2626, + "step": 10022, + "task_loss": 0.0723000094294548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30972692370414734, + "epoch": 8.47, + "learning_rate": 7.637362637362638e-06, + "loss": 0.371, + "step": 10023, + "task_loss": 1.400354266166687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4075191020965576, + "epoch": 8.47, + "learning_rate": 7.633136094674556e-06, + "loss": 0.2953, + "step": 10024, + "task_loss": 0.5419731140136719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38037964701652527, + "epoch": 8.47, + "learning_rate": 7.628909551986475e-06, + "loss": 0.4287, + "step": 10025, + "task_loss": 0.7014669179916382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6216515302658081, + "epoch": 8.47, + "learning_rate": 7.624683009298394e-06, + "loss": 0.4726, + "step": 10026, + "task_loss": 0.3705052137374878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.503270149230957, + "epoch": 8.48, + "learning_rate": 7.620456466610314e-06, + "loss": 0.3937, + "step": 10027, + "task_loss": 1.0167361497879028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3390476703643799, + "epoch": 8.48, + "learning_rate": 7.616229923922231e-06, + "loss": 0.3597, + "step": 10028, + "task_loss": 0.6971657276153564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5268529653549194, + "epoch": 8.48, + "learning_rate": 7.612003381234151e-06, + "loss": 0.4051, + "step": 10029, + "task_loss": 0.8590385317802429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21674463152885437, + "epoch": 8.48, + "learning_rate": 7.60777683854607e-06, + "loss": 0.3354, + "step": 10030, + "task_loss": 0.4922611117362976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5444332361221313, + "epoch": 8.48, + "learning_rate": 7.6035502958579895e-06, + "loss": 0.3865, + "step": 10031, + "task_loss": 0.7442688345909119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3745216131210327, + "epoch": 8.48, + "learning_rate": 7.599323753169907e-06, + "loss": 0.4549, + "step": 10032, + "task_loss": 0.4452337622642517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43300506472587585, + "epoch": 8.48, + "learning_rate": 7.595097210481826e-06, + "loss": 0.3777, + "step": 10033, + "task_loss": 0.28494003415107727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3483816981315613, + "epoch": 8.48, + "learning_rate": 7.590870667793745e-06, + "loss": 0.3578, + "step": 10034, + "task_loss": 0.3203364908695221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47657787799835205, + "epoch": 8.48, + "learning_rate": 7.586644125105664e-06, + "loss": 0.3299, + "step": 10035, + "task_loss": 0.43723198771476746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3334798812866211, + "epoch": 8.48, + "learning_rate": 7.582417582417582e-06, + "loss": 0.4036, + "step": 10036, + "task_loss": 0.727150559425354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2927389442920685, + "epoch": 8.48, + "learning_rate": 7.578191039729501e-06, + "loss": 0.3914, + "step": 10037, + "task_loss": 0.9308177828788757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.81629478931427, + "epoch": 8.48, + "learning_rate": 7.573964497041421e-06, + "loss": 0.6058, + "step": 10038, + "task_loss": 0.6574467420578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40065625309944153, + "epoch": 8.49, + "learning_rate": 7.56973795435334e-06, + "loss": 0.3675, + "step": 10039, + "task_loss": 1.2913084030151367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4693346619606018, + "epoch": 8.49, + "learning_rate": 7.565511411665258e-06, + "loss": 0.3927, + "step": 10040, + "task_loss": 0.15798111259937286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47506117820739746, + "epoch": 8.49, + "learning_rate": 7.561284868977177e-06, + "loss": 0.3319, + "step": 10041, + "task_loss": 0.47026804089546204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3603743314743042, + "epoch": 8.49, + "learning_rate": 7.557058326289096e-06, + "loss": 0.285, + "step": 10042, + "task_loss": 0.9409157037734985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24001289904117584, + "epoch": 8.49, + "learning_rate": 7.552831783601015e-06, + "loss": 0.3443, + "step": 10043, + "task_loss": 0.6532590985298157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.558394193649292, + "epoch": 8.49, + "learning_rate": 7.5486052409129325e-06, + "loss": 0.3767, + "step": 10044, + "task_loss": 0.9818922877311707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5583081841468811, + "epoch": 8.49, + "learning_rate": 7.544378698224852e-06, + "loss": 0.4201, + "step": 10045, + "task_loss": 0.6369345784187317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38640353083610535, + "epoch": 8.49, + "learning_rate": 7.540152155536771e-06, + "loss": 0.3307, + "step": 10046, + "task_loss": 0.5349611639976501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5335134267807007, + "epoch": 8.49, + "learning_rate": 7.535925612848691e-06, + "loss": 0.4176, + "step": 10047, + "task_loss": 0.8648219108581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28317153453826904, + "epoch": 8.49, + "learning_rate": 7.53169907016061e-06, + "loss": 0.3537, + "step": 10048, + "task_loss": 0.8494557738304138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3306199014186859, + "epoch": 8.49, + "learning_rate": 7.527472527472528e-06, + "loss": 0.3661, + "step": 10049, + "task_loss": 0.04323067143559456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2506070137023926, + "epoch": 8.5, + "learning_rate": 7.523245984784447e-06, + "loss": 0.3677, + "step": 10050, + "task_loss": 0.24322061240673065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.314693808555603, + "epoch": 8.5, + "learning_rate": 7.519019442096366e-06, + "loss": 0.3821, + "step": 10051, + "task_loss": 0.4480632543563843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28280672430992126, + "epoch": 8.5, + "learning_rate": 7.514792899408285e-06, + "loss": 0.3338, + "step": 10052, + "task_loss": 0.5980005264282227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3044377565383911, + "epoch": 8.5, + "learning_rate": 7.5105663567202025e-06, + "loss": 0.414, + "step": 10053, + "task_loss": 0.8885044455528259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36480021476745605, + "epoch": 8.5, + "learning_rate": 7.506339814032122e-06, + "loss": 0.3915, + "step": 10054, + "task_loss": 0.13071505725383759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3219485282897949, + "epoch": 8.5, + "learning_rate": 7.502113271344041e-06, + "loss": 0.3008, + "step": 10055, + "task_loss": 0.8692150115966797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22662855684757233, + "epoch": 8.5, + "learning_rate": 7.49788672865596e-06, + "loss": 0.3045, + "step": 10056, + "task_loss": 0.3828751742839813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.300873726606369, + "epoch": 8.5, + "learning_rate": 7.493660185967878e-06, + "loss": 0.344, + "step": 10057, + "task_loss": 0.4453399181365967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3980865478515625, + "epoch": 8.5, + "learning_rate": 7.489433643279797e-06, + "loss": 0.4711, + "step": 10058, + "task_loss": 0.7516759634017944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48048147559165955, + "epoch": 8.5, + "learning_rate": 7.485207100591717e-06, + "loss": 0.3139, + "step": 10059, + "task_loss": 0.9466328620910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30237358808517456, + "epoch": 8.5, + "learning_rate": 7.480980557903636e-06, + "loss": 0.453, + "step": 10060, + "task_loss": 0.1238284558057785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3572167158126831, + "epoch": 8.5, + "learning_rate": 7.476754015215554e-06, + "loss": 0.3489, + "step": 10061, + "task_loss": 0.7669247984886169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40110349655151367, + "epoch": 8.51, + "learning_rate": 7.4725274725274726e-06, + "loss": 0.4407, + "step": 10062, + "task_loss": 0.8436221480369568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3704720437526703, + "epoch": 8.51, + "learning_rate": 7.468300929839392e-06, + "loss": 0.3647, + "step": 10063, + "task_loss": 0.4320041835308075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17296141386032104, + "epoch": 8.51, + "learning_rate": 7.464074387151311e-06, + "loss": 0.4688, + "step": 10064, + "task_loss": 0.8824068903923035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34670549631118774, + "epoch": 8.51, + "learning_rate": 7.459847844463229e-06, + "loss": 0.4117, + "step": 10065, + "task_loss": 1.4002689123153687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2216339111328125, + "epoch": 8.51, + "learning_rate": 7.455621301775148e-06, + "loss": 0.3675, + "step": 10066, + "task_loss": 0.14401815831661224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30027759075164795, + "epoch": 8.51, + "learning_rate": 7.451394759087067e-06, + "loss": 0.4478, + "step": 10067, + "task_loss": 0.7981740236282349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4449779987335205, + "epoch": 8.51, + "learning_rate": 7.447168216398987e-06, + "loss": 0.424, + "step": 10068, + "task_loss": 0.6555307507514954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38975387811660767, + "epoch": 8.51, + "learning_rate": 7.442941673710904e-06, + "loss": 0.3291, + "step": 10069, + "task_loss": 0.4789889454841614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2745557725429535, + "epoch": 8.51, + "learning_rate": 7.438715131022824e-06, + "loss": 0.2907, + "step": 10070, + "task_loss": 0.28161874413490295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2654450535774231, + "epoch": 8.51, + "learning_rate": 7.434488588334743e-06, + "loss": 0.4382, + "step": 10071, + "task_loss": 0.31980106234550476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21384823322296143, + "epoch": 8.51, + "learning_rate": 7.4302620456466615e-06, + "loss": 0.2253, + "step": 10072, + "task_loss": 0.36582908034324646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41527092456817627, + "epoch": 8.51, + "learning_rate": 7.4260355029585795e-06, + "loss": 0.48, + "step": 10073, + "task_loss": 0.47583556175231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35797134041786194, + "epoch": 8.52, + "learning_rate": 7.421808960270498e-06, + "loss": 0.4755, + "step": 10074, + "task_loss": 1.0336451530456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5484205484390259, + "epoch": 8.52, + "learning_rate": 7.417582417582418e-06, + "loss": 0.6047, + "step": 10075, + "task_loss": 1.4436770677566528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22701247036457062, + "epoch": 8.52, + "learning_rate": 7.413355874894337e-06, + "loss": 0.4339, + "step": 10076, + "task_loss": 0.4193384051322937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3061693608760834, + "epoch": 8.52, + "learning_rate": 7.409129332206256e-06, + "loss": 0.4007, + "step": 10077, + "task_loss": 0.4975164830684662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32204893231391907, + "epoch": 8.52, + "learning_rate": 7.404902789518174e-06, + "loss": 0.3968, + "step": 10078, + "task_loss": 0.49848952889442444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3851049840450287, + "epoch": 8.52, + "learning_rate": 7.400676246830094e-06, + "loss": 0.349, + "step": 10079, + "task_loss": 0.5062575340270996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3190443813800812, + "epoch": 8.52, + "learning_rate": 7.396449704142013e-06, + "loss": 0.3175, + "step": 10080, + "task_loss": 1.091103434562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.278963565826416, + "epoch": 8.52, + "learning_rate": 7.3922231614539315e-06, + "loss": 0.3282, + "step": 10081, + "task_loss": 0.2635341286659241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3126611113548279, + "epoch": 8.52, + "learning_rate": 7.3879966187658495e-06, + "loss": 0.3452, + "step": 10082, + "task_loss": 0.663848876953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41551756858825684, + "epoch": 8.52, + "learning_rate": 7.3837700760777684e-06, + "loss": 0.4359, + "step": 10083, + "task_loss": 0.37026870250701904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34381431341171265, + "epoch": 8.52, + "learning_rate": 7.379543533389688e-06, + "loss": 0.4286, + "step": 10084, + "task_loss": 1.0224076509475708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22305767238140106, + "epoch": 8.52, + "learning_rate": 7.375316990701607e-06, + "loss": 0.3554, + "step": 10085, + "task_loss": 0.10315810889005661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38133835792541504, + "epoch": 8.53, + "learning_rate": 7.371090448013525e-06, + "loss": 0.4172, + "step": 10086, + "task_loss": 0.5190756320953369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3535868525505066, + "epoch": 8.53, + "learning_rate": 7.366863905325444e-06, + "loss": 0.4403, + "step": 10087, + "task_loss": 1.0829070806503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2347486913204193, + "epoch": 8.53, + "learning_rate": 7.362637362637363e-06, + "loss": 0.3283, + "step": 10088, + "task_loss": 0.8213214874267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21849556267261505, + "epoch": 8.53, + "learning_rate": 7.358410819949283e-06, + "loss": 0.4049, + "step": 10089, + "task_loss": 0.8195666074752808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46921515464782715, + "epoch": 8.53, + "learning_rate": 7.3541842772612e-06, + "loss": 0.3611, + "step": 10090, + "task_loss": 0.3248871862888336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3546496331691742, + "epoch": 8.53, + "learning_rate": 7.3499577345731196e-06, + "loss": 0.3851, + "step": 10091, + "task_loss": 0.21330218017101288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19309554994106293, + "epoch": 8.53, + "learning_rate": 7.3457311918850385e-06, + "loss": 0.3724, + "step": 10092, + "task_loss": 0.10121327638626099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5482072234153748, + "epoch": 8.53, + "learning_rate": 7.341504649196957e-06, + "loss": 0.4397, + "step": 10093, + "task_loss": 1.7027041912078857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29318809509277344, + "epoch": 8.53, + "learning_rate": 7.337278106508875e-06, + "loss": 0.3756, + "step": 10094, + "task_loss": 1.165995478630066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5020350217819214, + "epoch": 8.53, + "learning_rate": 7.333051563820795e-06, + "loss": 0.3411, + "step": 10095, + "task_loss": 0.48852604627609253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2605220377445221, + "epoch": 8.53, + "learning_rate": 7.328825021132714e-06, + "loss": 0.3025, + "step": 10096, + "task_loss": 0.4516128599643707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31064432859420776, + "epoch": 8.53, + "learning_rate": 7.324598478444633e-06, + "loss": 0.406, + "step": 10097, + "task_loss": 0.7675235271453857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4627359211444855, + "epoch": 8.54, + "learning_rate": 7.320371935756551e-06, + "loss": 0.4223, + "step": 10098, + "task_loss": 1.088073968887329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23597946763038635, + "epoch": 8.54, + "learning_rate": 7.31614539306847e-06, + "loss": 0.3262, + "step": 10099, + "task_loss": 0.31622835993766785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43826401233673096, + "epoch": 8.54, + "learning_rate": 7.31191885038039e-06, + "loss": 0.4123, + "step": 10100, + "task_loss": 0.23514068126678467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4982466697692871, + "epoch": 8.54, + "learning_rate": 7.3076923076923085e-06, + "loss": 0.5006, + "step": 10101, + "task_loss": 0.4700887203216553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34766313433647156, + "epoch": 8.54, + "learning_rate": 7.3034657650042265e-06, + "loss": 0.4419, + "step": 10102, + "task_loss": 0.4832528829574585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3392738997936249, + "epoch": 8.54, + "learning_rate": 7.299239222316145e-06, + "loss": 0.4125, + "step": 10103, + "task_loss": 0.4250722825527191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29427361488342285, + "epoch": 8.54, + "learning_rate": 7.295012679628064e-06, + "loss": 0.3331, + "step": 10104, + "task_loss": 0.8826963901519775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24012696743011475, + "epoch": 8.54, + "learning_rate": 7.290786136939984e-06, + "loss": 0.3213, + "step": 10105, + "task_loss": 0.5127995014190674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28859108686447144, + "epoch": 8.54, + "learning_rate": 7.286559594251901e-06, + "loss": 0.4814, + "step": 10106, + "task_loss": 0.22149032354354858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5038437247276306, + "epoch": 8.54, + "learning_rate": 7.282333051563821e-06, + "loss": 0.4396, + "step": 10107, + "task_loss": 0.7255251407623291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.437401682138443, + "epoch": 8.54, + "learning_rate": 7.27810650887574e-06, + "loss": 0.3674, + "step": 10108, + "task_loss": 0.8912453651428223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34648585319519043, + "epoch": 8.54, + "learning_rate": 7.273879966187659e-06, + "loss": 0.3254, + "step": 10109, + "task_loss": 0.6494439244270325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4895307719707489, + "epoch": 8.55, + "learning_rate": 7.2696534234995785e-06, + "loss": 0.3736, + "step": 10110, + "task_loss": 0.7184210419654846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5675989985466003, + "epoch": 8.55, + "learning_rate": 7.2654268808114966e-06, + "loss": 0.3738, + "step": 10111, + "task_loss": 0.5886959433555603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5565800666809082, + "epoch": 8.55, + "learning_rate": 7.2612003381234154e-06, + "loss": 0.4729, + "step": 10112, + "task_loss": 0.5893681645393372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38432931900024414, + "epoch": 8.55, + "learning_rate": 7.256973795435334e-06, + "loss": 0.395, + "step": 10113, + "task_loss": 0.38745537400245667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5947434902191162, + "epoch": 8.55, + "learning_rate": 7.252747252747254e-06, + "loss": 0.4227, + "step": 10114, + "task_loss": 1.39102041721344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39933276176452637, + "epoch": 8.55, + "learning_rate": 7.248520710059171e-06, + "loss": 0.3518, + "step": 10115, + "task_loss": 1.1552631855010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19872742891311646, + "epoch": 8.55, + "learning_rate": 7.244294167371091e-06, + "loss": 0.3189, + "step": 10116, + "task_loss": 0.47361767292022705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21623800694942474, + "epoch": 8.55, + "learning_rate": 7.24006762468301e-06, + "loss": 0.3268, + "step": 10117, + "task_loss": 0.727571964263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19654525816440582, + "epoch": 8.55, + "learning_rate": 7.235841081994929e-06, + "loss": 0.3962, + "step": 10118, + "task_loss": 0.011826390400528908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.387768030166626, + "epoch": 8.55, + "learning_rate": 7.231614539306847e-06, + "loss": 0.3388, + "step": 10119, + "task_loss": 0.20608526468276978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3727930188179016, + "epoch": 8.55, + "learning_rate": 7.227387996618766e-06, + "loss": 0.2993, + "step": 10120, + "task_loss": 0.3519766628742218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5103909969329834, + "epoch": 8.56, + "learning_rate": 7.2231614539306855e-06, + "loss": 0.4435, + "step": 10121, + "task_loss": 0.8131870627403259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2915813624858856, + "epoch": 8.56, + "learning_rate": 7.218934911242604e-06, + "loss": 0.3638, + "step": 10122, + "task_loss": 0.4115925431251526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2068556398153305, + "epoch": 8.56, + "learning_rate": 7.214708368554522e-06, + "loss": 0.2987, + "step": 10123, + "task_loss": 1.0224032402038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.468778133392334, + "epoch": 8.56, + "learning_rate": 7.210481825866441e-06, + "loss": 0.333, + "step": 10124, + "task_loss": 0.5400423407554626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23247352242469788, + "epoch": 8.56, + "learning_rate": 7.20625528317836e-06, + "loss": 0.2747, + "step": 10125, + "task_loss": 0.3630013167858124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27808061242103577, + "epoch": 8.56, + "learning_rate": 7.20202874049028e-06, + "loss": 0.3306, + "step": 10126, + "task_loss": 0.6817033290863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2409682720899582, + "epoch": 8.56, + "learning_rate": 7.197802197802198e-06, + "loss": 0.3599, + "step": 10127, + "task_loss": 0.1748778074979782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.498413622379303, + "epoch": 8.56, + "learning_rate": 7.193575655114117e-06, + "loss": 0.3817, + "step": 10128, + "task_loss": 0.21646666526794434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3146403431892395, + "epoch": 8.56, + "learning_rate": 7.189349112426036e-06, + "loss": 0.3904, + "step": 10129, + "task_loss": 0.11147406697273254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48959246277809143, + "epoch": 8.56, + "learning_rate": 7.1851225697379555e-06, + "loss": 0.465, + "step": 10130, + "task_loss": 0.6751667261123657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5285665392875671, + "epoch": 8.56, + "learning_rate": 7.180896027049873e-06, + "loss": 0.4119, + "step": 10131, + "task_loss": 0.6807984709739685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.487068235874176, + "epoch": 8.56, + "learning_rate": 7.1766694843617924e-06, + "loss": 0.3189, + "step": 10132, + "task_loss": 0.868852972984314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39783355593681335, + "epoch": 8.57, + "learning_rate": 7.172442941673711e-06, + "loss": 0.3746, + "step": 10133, + "task_loss": 0.6925401091575623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2802753448486328, + "epoch": 8.57, + "learning_rate": 7.16821639898563e-06, + "loss": 0.3745, + "step": 10134, + "task_loss": 0.31753256916999817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2982405424118042, + "epoch": 8.57, + "learning_rate": 7.163989856297548e-06, + "loss": 0.4042, + "step": 10135, + "task_loss": 0.5780658721923828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31185731291770935, + "epoch": 8.57, + "learning_rate": 7.159763313609467e-06, + "loss": 0.367, + "step": 10136, + "task_loss": 0.4677661061286926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2681635022163391, + "epoch": 8.57, + "learning_rate": 7.155536770921387e-06, + "loss": 0.4478, + "step": 10137, + "task_loss": 0.3334159851074219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6161929368972778, + "epoch": 8.57, + "learning_rate": 7.151310228233306e-06, + "loss": 0.4481, + "step": 10138, + "task_loss": 1.0833162069320679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33256983757019043, + "epoch": 8.57, + "learning_rate": 7.147083685545225e-06, + "loss": 0.3505, + "step": 10139, + "task_loss": 0.48805513978004456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3421381115913391, + "epoch": 8.57, + "learning_rate": 7.142857142857143e-06, + "loss": 0.479, + "step": 10140, + "task_loss": 0.16572847962379456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24007548391819, + "epoch": 8.57, + "learning_rate": 7.138630600169062e-06, + "loss": 0.3268, + "step": 10141, + "task_loss": 0.21054160594940186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20671992003917694, + "epoch": 8.57, + "learning_rate": 7.134404057480981e-06, + "loss": 0.1864, + "step": 10142, + "task_loss": 0.22633452713489532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2945789098739624, + "epoch": 8.57, + "learning_rate": 7.1301775147929e-06, + "loss": 0.3734, + "step": 10143, + "task_loss": 1.0199781656265259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24160727858543396, + "epoch": 8.57, + "learning_rate": 7.125950972104818e-06, + "loss": 0.2778, + "step": 10144, + "task_loss": 0.14128464460372925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21019797027111053, + "epoch": 8.58, + "learning_rate": 7.121724429416737e-06, + "loss": 0.3023, + "step": 10145, + "task_loss": 0.46944016218185425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18696178495883942, + "epoch": 8.58, + "learning_rate": 7.117497886728657e-06, + "loss": 0.3682, + "step": 10146, + "task_loss": 0.18120352923870087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4178585410118103, + "epoch": 8.58, + "learning_rate": 7.113271344040576e-06, + "loss": 0.3425, + "step": 10147, + "task_loss": 0.8405928611755371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13897255063056946, + "epoch": 8.58, + "learning_rate": 7.109044801352494e-06, + "loss": 0.3161, + "step": 10148, + "task_loss": 0.39884504675865173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4695075750350952, + "epoch": 8.58, + "learning_rate": 7.104818258664413e-06, + "loss": 0.4271, + "step": 10149, + "task_loss": 0.6146615147590637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4751514196395874, + "epoch": 8.58, + "learning_rate": 7.100591715976332e-06, + "loss": 0.3442, + "step": 10150, + "task_loss": 0.41544121503829956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2777957320213318, + "epoch": 8.58, + "learning_rate": 7.096365173288251e-06, + "loss": 0.3404, + "step": 10151, + "task_loss": 0.110404372215271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48751333355903625, + "epoch": 8.58, + "learning_rate": 7.0921386306001686e-06, + "loss": 0.422, + "step": 10152, + "task_loss": 0.20621347427368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3942243456840515, + "epoch": 8.58, + "learning_rate": 7.087912087912088e-06, + "loss": 0.3859, + "step": 10153, + "task_loss": 0.6490058898925781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2008228600025177, + "epoch": 8.58, + "learning_rate": 7.083685545224007e-06, + "loss": 0.3257, + "step": 10154, + "task_loss": 0.05771105736494064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4592174291610718, + "epoch": 8.58, + "learning_rate": 7.079459002535926e-06, + "loss": 0.4414, + "step": 10155, + "task_loss": 0.6298792958259583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37410280108451843, + "epoch": 8.58, + "learning_rate": 7.075232459847844e-06, + "loss": 0.4035, + "step": 10156, + "task_loss": 0.5367450714111328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19028428196907043, + "epoch": 8.59, + "learning_rate": 7.071005917159763e-06, + "loss": 0.2886, + "step": 10157, + "task_loss": 0.4632282555103302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3145682215690613, + "epoch": 8.59, + "learning_rate": 7.066779374471683e-06, + "loss": 0.2593, + "step": 10158, + "task_loss": 0.26095959544181824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29716023802757263, + "epoch": 8.59, + "learning_rate": 7.062552831783602e-06, + "loss": 0.4543, + "step": 10159, + "task_loss": 0.47853729128837585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2286374717950821, + "epoch": 8.59, + "learning_rate": 7.05832628909552e-06, + "loss": 0.3965, + "step": 10160, + "task_loss": 0.5313234925270081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3271198868751526, + "epoch": 8.59, + "learning_rate": 7.054099746407439e-06, + "loss": 0.4234, + "step": 10161, + "task_loss": 0.2500517964363098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3360799252986908, + "epoch": 8.59, + "learning_rate": 7.049873203719358e-06, + "loss": 0.3892, + "step": 10162, + "task_loss": 0.6327769756317139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3912489414215088, + "epoch": 8.59, + "learning_rate": 7.045646661031277e-06, + "loss": 0.3504, + "step": 10163, + "task_loss": 1.0093562602996826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33680081367492676, + "epoch": 8.59, + "learning_rate": 7.041420118343195e-06, + "loss": 0.3318, + "step": 10164, + "task_loss": 0.3986647427082062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31090137362480164, + "epoch": 8.59, + "learning_rate": 7.037193575655114e-06, + "loss": 0.3545, + "step": 10165, + "task_loss": 0.5555381774902344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31631386280059814, + "epoch": 8.59, + "learning_rate": 7.032967032967033e-06, + "loss": 0.315, + "step": 10166, + "task_loss": 0.12015603482723236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35853099822998047, + "epoch": 8.59, + "learning_rate": 7.028740490278953e-06, + "loss": 0.3235, + "step": 10167, + "task_loss": 0.3177771270275116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3126335144042969, + "epoch": 8.59, + "learning_rate": 7.024513947590872e-06, + "loss": 0.3528, + "step": 10168, + "task_loss": 0.542525589466095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32863014936447144, + "epoch": 8.6, + "learning_rate": 7.02028740490279e-06, + "loss": 0.3917, + "step": 10169, + "task_loss": 0.47550228238105774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3473438024520874, + "epoch": 8.6, + "learning_rate": 7.016060862214709e-06, + "loss": 0.4131, + "step": 10170, + "task_loss": 1.3606388568878174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3230997323989868, + "epoch": 8.6, + "learning_rate": 7.0118343195266275e-06, + "loss": 0.3996, + "step": 10171, + "task_loss": 0.33820974826812744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2735862135887146, + "epoch": 8.6, + "learning_rate": 7.007607776838547e-06, + "loss": 0.4653, + "step": 10172, + "task_loss": 0.4941348135471344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28236478567123413, + "epoch": 8.6, + "learning_rate": 7.0033812341504644e-06, + "loss": 0.4992, + "step": 10173, + "task_loss": 0.38389989733695984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3128219246864319, + "epoch": 8.6, + "learning_rate": 6.999154691462384e-06, + "loss": 0.3308, + "step": 10174, + "task_loss": 0.6091203093528748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36828166246414185, + "epoch": 8.6, + "learning_rate": 6.994928148774303e-06, + "loss": 0.3239, + "step": 10175, + "task_loss": 0.44792771339416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47807371616363525, + "epoch": 8.6, + "learning_rate": 6.990701606086222e-06, + "loss": 0.4251, + "step": 10176, + "task_loss": 0.5433327555656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6082231998443604, + "epoch": 8.6, + "learning_rate": 6.98647506339814e-06, + "loss": 0.5098, + "step": 10177, + "task_loss": 0.3901612460613251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3770318031311035, + "epoch": 8.6, + "learning_rate": 6.98224852071006e-06, + "loss": 0.425, + "step": 10178, + "task_loss": 0.48897072672843933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25661683082580566, + "epoch": 8.6, + "learning_rate": 6.978021978021979e-06, + "loss": 0.4231, + "step": 10179, + "task_loss": 0.032284293323755264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2921563684940338, + "epoch": 8.6, + "learning_rate": 6.9737954353338975e-06, + "loss": 0.4419, + "step": 10180, + "task_loss": 0.6852441430091858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3942301869392395, + "epoch": 8.61, + "learning_rate": 6.9695688926458156e-06, + "loss": 0.4107, + "step": 10181, + "task_loss": 0.35574039816856384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5849505066871643, + "epoch": 8.61, + "learning_rate": 6.9653423499577345e-06, + "loss": 0.4988, + "step": 10182, + "task_loss": 0.8089973330497742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19590267539024353, + "epoch": 8.61, + "learning_rate": 6.961115807269654e-06, + "loss": 0.3233, + "step": 10183, + "task_loss": 0.08618414402008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24896256625652313, + "epoch": 8.61, + "learning_rate": 6.956889264581573e-06, + "loss": 0.3564, + "step": 10184, + "task_loss": 0.8752989768981934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5699318647384644, + "epoch": 8.61, + "learning_rate": 6.952662721893491e-06, + "loss": 0.3834, + "step": 10185, + "task_loss": 0.4956744313240051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2537305951118469, + "epoch": 8.61, + "learning_rate": 6.94843617920541e-06, + "loss": 0.3133, + "step": 10186, + "task_loss": 0.3263333737850189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5843814611434937, + "epoch": 8.61, + "learning_rate": 6.944209636517329e-06, + "loss": 0.4289, + "step": 10187, + "task_loss": 0.4522673487663269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3710978925228119, + "epoch": 8.61, + "learning_rate": 6.939983093829249e-06, + "loss": 0.4026, + "step": 10188, + "task_loss": 1.0115045309066772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37507009506225586, + "epoch": 8.61, + "learning_rate": 6.935756551141166e-06, + "loss": 0.287, + "step": 10189, + "task_loss": 0.6558169722557068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3815043866634369, + "epoch": 8.61, + "learning_rate": 6.931530008453086e-06, + "loss": 0.3995, + "step": 10190, + "task_loss": 1.0059436559677124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6301705241203308, + "epoch": 8.61, + "learning_rate": 6.9273034657650045e-06, + "loss": 0.3305, + "step": 10191, + "task_loss": 0.4730185270309448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32997334003448486, + "epoch": 8.61, + "learning_rate": 6.923076923076923e-06, + "loss": 0.3405, + "step": 10192, + "task_loss": 0.1453944742679596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4705065190792084, + "epoch": 8.62, + "learning_rate": 6.9188503803888414e-06, + "loss": 0.4816, + "step": 10193, + "task_loss": 0.8967434763908386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.321374773979187, + "epoch": 8.62, + "learning_rate": 6.914623837700761e-06, + "loss": 0.3652, + "step": 10194, + "task_loss": 0.278384268283844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5251705646514893, + "epoch": 8.62, + "learning_rate": 6.91039729501268e-06, + "loss": 0.5152, + "step": 10195, + "task_loss": 0.4618043005466461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36276179552078247, + "epoch": 8.62, + "learning_rate": 6.906170752324599e-06, + "loss": 0.4296, + "step": 10196, + "task_loss": 0.34141767024993896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32338857650756836, + "epoch": 8.62, + "learning_rate": 6.901944209636519e-06, + "loss": 0.4033, + "step": 10197, + "task_loss": 0.3074069917201996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38566267490386963, + "epoch": 8.62, + "learning_rate": 6.897717666948436e-06, + "loss": 0.4666, + "step": 10198, + "task_loss": 0.5800369381904602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35965853929519653, + "epoch": 8.62, + "learning_rate": 6.893491124260356e-06, + "loss": 0.3819, + "step": 10199, + "task_loss": 0.3991096317768097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21260502934455872, + "epoch": 8.62, + "learning_rate": 6.8892645815722745e-06, + "loss": 0.3631, + "step": 10200, + "task_loss": 0.5688323378562927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1918761432170868, + "epoch": 8.62, + "learning_rate": 6.885038038884193e-06, + "loss": 0.2916, + "step": 10201, + "task_loss": 0.4770876467227936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2169959396123886, + "epoch": 8.62, + "learning_rate": 6.8808114961961115e-06, + "loss": 0.3478, + "step": 10202, + "task_loss": 0.8480307459831238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23685654997825623, + "epoch": 8.62, + "learning_rate": 6.87658495350803e-06, + "loss": 0.3525, + "step": 10203, + "task_loss": 0.1132311001420021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41936078667640686, + "epoch": 8.63, + "learning_rate": 6.87235841081995e-06, + "loss": 0.3639, + "step": 10204, + "task_loss": 0.4771448075771332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2836070656776428, + "epoch": 8.63, + "learning_rate": 6.868131868131869e-06, + "loss": 0.33, + "step": 10205, + "task_loss": 0.5479159355163574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20466679334640503, + "epoch": 8.63, + "learning_rate": 6.863905325443787e-06, + "loss": 0.235, + "step": 10206, + "task_loss": 0.08658955991268158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23490110039710999, + "epoch": 8.63, + "learning_rate": 6.859678782755706e-06, + "loss": 0.4458, + "step": 10207, + "task_loss": 0.36428141593933105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33005669713020325, + "epoch": 8.63, + "learning_rate": 6.855452240067625e-06, + "loss": 0.376, + "step": 10208, + "task_loss": 0.5652322173118591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3751472234725952, + "epoch": 8.63, + "learning_rate": 6.8512256973795445e-06, + "loss": 0.331, + "step": 10209, + "task_loss": 0.47269508242607117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32046210765838623, + "epoch": 8.63, + "learning_rate": 6.846999154691463e-06, + "loss": 0.4281, + "step": 10210, + "task_loss": 0.6363443732261658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6729910969734192, + "epoch": 8.63, + "learning_rate": 6.8427726120033815e-06, + "loss": 0.4304, + "step": 10211, + "task_loss": 1.261833667755127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3313552141189575, + "epoch": 8.63, + "learning_rate": 6.8385460693153e-06, + "loss": 0.4255, + "step": 10212, + "task_loss": 0.9893395304679871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2731635570526123, + "epoch": 8.63, + "learning_rate": 6.83431952662722e-06, + "loss": 0.2797, + "step": 10213, + "task_loss": 0.5395263433456421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3885250687599182, + "epoch": 8.63, + "learning_rate": 6.830092983939137e-06, + "loss": 0.3402, + "step": 10214, + "task_loss": 0.6025234460830688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3497835695743561, + "epoch": 8.63, + "learning_rate": 6.825866441251057e-06, + "loss": 0.4138, + "step": 10215, + "task_loss": 0.965369462966919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30603545904159546, + "epoch": 8.64, + "learning_rate": 6.821639898562976e-06, + "loss": 0.4118, + "step": 10216, + "task_loss": 0.7757984399795532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3154016137123108, + "epoch": 8.64, + "learning_rate": 6.817413355874895e-06, + "loss": 0.4509, + "step": 10217, + "task_loss": 1.183670997619629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.501570999622345, + "epoch": 8.64, + "learning_rate": 6.813186813186813e-06, + "loss": 0.3618, + "step": 10218, + "task_loss": 0.5592185854911804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1721823811531067, + "epoch": 8.64, + "learning_rate": 6.808960270498732e-06, + "loss": 0.3182, + "step": 10219, + "task_loss": 0.014799512922763824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2652093470096588, + "epoch": 8.64, + "learning_rate": 6.8047337278106515e-06, + "loss": 0.3558, + "step": 10220, + "task_loss": 0.7648015022277832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4824661910533905, + "epoch": 8.64, + "learning_rate": 6.80050718512257e-06, + "loss": 0.3913, + "step": 10221, + "task_loss": 1.0660936832427979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37593889236450195, + "epoch": 8.64, + "learning_rate": 6.7962806424344884e-06, + "loss": 0.4496, + "step": 10222, + "task_loss": 0.5146792531013489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2139626443386078, + "epoch": 8.64, + "learning_rate": 6.792054099746407e-06, + "loss": 0.3522, + "step": 10223, + "task_loss": 0.42508557438850403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.10494861751794815, + "epoch": 8.64, + "learning_rate": 6.787827557058326e-06, + "loss": 0.249, + "step": 10224, + "task_loss": 0.009125777520239353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3527168333530426, + "epoch": 8.64, + "learning_rate": 6.783601014370246e-06, + "loss": 0.3523, + "step": 10225, + "task_loss": 0.14795461297035217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20544445514678955, + "epoch": 8.64, + "learning_rate": 6.779374471682165e-06, + "loss": 0.3696, + "step": 10226, + "task_loss": 0.18496811389923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31022948026657104, + "epoch": 8.64, + "learning_rate": 6.775147928994083e-06, + "loss": 0.3727, + "step": 10227, + "task_loss": 0.8266865611076355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42576247453689575, + "epoch": 8.65, + "learning_rate": 6.770921386306002e-06, + "loss": 0.3534, + "step": 10228, + "task_loss": 0.3835059404373169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2530650794506073, + "epoch": 8.65, + "learning_rate": 6.7666948436179215e-06, + "loss": 0.3331, + "step": 10229, + "task_loss": 0.32775741815567017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49093538522720337, + "epoch": 8.65, + "learning_rate": 6.76246830092984e-06, + "loss": 0.4462, + "step": 10230, + "task_loss": 0.4900144040584564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.509456992149353, + "epoch": 8.65, + "learning_rate": 6.7582417582417585e-06, + "loss": 0.4056, + "step": 10231, + "task_loss": 0.5201508402824402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14230309426784515, + "epoch": 8.65, + "learning_rate": 6.754015215553677e-06, + "loss": 0.3252, + "step": 10232, + "task_loss": 0.36747416853904724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40386271476745605, + "epoch": 8.65, + "learning_rate": 6.749788672865596e-06, + "loss": 0.4036, + "step": 10233, + "task_loss": 0.6796191334724426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5331763625144958, + "epoch": 8.65, + "learning_rate": 6.745562130177516e-06, + "loss": 0.4447, + "step": 10234, + "task_loss": 0.568722128868103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38653236627578735, + "epoch": 8.65, + "learning_rate": 6.741335587489433e-06, + "loss": 0.4303, + "step": 10235, + "task_loss": 1.2374249696731567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2432292401790619, + "epoch": 8.65, + "learning_rate": 6.737109044801353e-06, + "loss": 0.3554, + "step": 10236, + "task_loss": 0.8909711837768555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38222235441207886, + "epoch": 8.65, + "learning_rate": 6.732882502113272e-06, + "loss": 0.3212, + "step": 10237, + "task_loss": 0.07854944467544556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3896022439002991, + "epoch": 8.65, + "learning_rate": 6.728655959425191e-06, + "loss": 0.3791, + "step": 10238, + "task_loss": 0.8703241348266602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26854413747787476, + "epoch": 8.65, + "learning_rate": 6.724429416737109e-06, + "loss": 0.4316, + "step": 10239, + "task_loss": 0.8514972925186157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2054835706949234, + "epoch": 8.66, + "learning_rate": 6.720202874049028e-06, + "loss": 0.4011, + "step": 10240, + "task_loss": 1.0571436882019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4221191108226776, + "epoch": 8.66, + "learning_rate": 6.715976331360947e-06, + "loss": 0.3571, + "step": 10241, + "task_loss": 0.048402491956949234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19214095175266266, + "epoch": 8.66, + "learning_rate": 6.711749788672866e-06, + "loss": 0.2677, + "step": 10242, + "task_loss": 0.14545761048793793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2103838324546814, + "epoch": 8.66, + "learning_rate": 6.707523245984784e-06, + "loss": 0.3149, + "step": 10243, + "task_loss": 0.058164045214653015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34784746170043945, + "epoch": 8.66, + "learning_rate": 6.703296703296703e-06, + "loss": 0.4367, + "step": 10244, + "task_loss": 0.32112735509872437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.465974897146225, + "epoch": 8.66, + "learning_rate": 6.699070160608623e-06, + "loss": 0.346, + "step": 10245, + "task_loss": 0.41615378856658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.319076806306839, + "epoch": 8.66, + "learning_rate": 6.694843617920542e-06, + "loss": 0.4378, + "step": 10246, + "task_loss": 0.4854883849620819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36962783336639404, + "epoch": 8.66, + "learning_rate": 6.69061707523246e-06, + "loss": 0.5306, + "step": 10247, + "task_loss": 1.4593161344528198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44404715299606323, + "epoch": 8.66, + "learning_rate": 6.686390532544379e-06, + "loss": 0.4139, + "step": 10248, + "task_loss": 0.8341437578201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23270398378372192, + "epoch": 8.66, + "learning_rate": 6.682163989856298e-06, + "loss": 0.3516, + "step": 10249, + "task_loss": 0.2919387221336365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4469876289367676, + "epoch": 8.66, + "learning_rate": 6.677937447168217e-06, + "loss": 0.3963, + "step": 10250, + "task_loss": 1.6100282669067383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37517040967941284, + "epoch": 8.66, + "learning_rate": 6.673710904480135e-06, + "loss": 0.3873, + "step": 10251, + "task_loss": 0.40287190675735474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40515023469924927, + "epoch": 8.67, + "learning_rate": 6.669484361792054e-06, + "loss": 0.3603, + "step": 10252, + "task_loss": 0.9230337738990784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5311192274093628, + "epoch": 8.67, + "learning_rate": 6.665257819103973e-06, + "loss": 0.4707, + "step": 10253, + "task_loss": 1.1787490844726562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2533622086048126, + "epoch": 8.67, + "learning_rate": 6.661031276415892e-06, + "loss": 0.3364, + "step": 10254, + "task_loss": 0.40570521354675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3649572432041168, + "epoch": 8.67, + "learning_rate": 6.656804733727812e-06, + "loss": 0.3396, + "step": 10255, + "task_loss": 0.5936547517776489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4385209083557129, + "epoch": 8.67, + "learning_rate": 6.652578191039729e-06, + "loss": 0.3883, + "step": 10256, + "task_loss": 0.09599509835243225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2012832760810852, + "epoch": 8.67, + "learning_rate": 6.648351648351649e-06, + "loss": 0.3553, + "step": 10257, + "task_loss": 0.5136467218399048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2455396205186844, + "epoch": 8.67, + "learning_rate": 6.644125105663568e-06, + "loss": 0.4163, + "step": 10258, + "task_loss": 0.25033038854599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33249711990356445, + "epoch": 8.67, + "learning_rate": 6.639898562975487e-06, + "loss": 0.3978, + "step": 10259, + "task_loss": 0.16501210629940033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23528704047203064, + "epoch": 8.67, + "learning_rate": 6.635672020287405e-06, + "loss": 0.37, + "step": 10260, + "task_loss": 0.46407756209373474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4347486197948456, + "epoch": 8.67, + "learning_rate": 6.631445477599324e-06, + "loss": 0.3962, + "step": 10261, + "task_loss": 0.3760209083557129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34073951840400696, + "epoch": 8.67, + "learning_rate": 6.627218934911243e-06, + "loss": 0.3091, + "step": 10262, + "task_loss": 0.38485804200172424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43871307373046875, + "epoch": 8.67, + "learning_rate": 6.622992392223162e-06, + "loss": 0.3148, + "step": 10263, + "task_loss": 0.5221733450889587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3766127824783325, + "epoch": 8.68, + "learning_rate": 6.61876584953508e-06, + "loss": 0.3203, + "step": 10264, + "task_loss": 0.6137000322341919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5030971169471741, + "epoch": 8.68, + "learning_rate": 6.614539306846999e-06, + "loss": 0.3669, + "step": 10265, + "task_loss": 0.5914580821990967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.273930162191391, + "epoch": 8.68, + "learning_rate": 6.610312764158919e-06, + "loss": 0.312, + "step": 10266, + "task_loss": 0.5968537330627441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37613046169281006, + "epoch": 8.68, + "learning_rate": 6.606086221470838e-06, + "loss": 0.4175, + "step": 10267, + "task_loss": 1.101049542427063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3683258295059204, + "epoch": 8.68, + "learning_rate": 6.601859678782756e-06, + "loss": 0.3889, + "step": 10268, + "task_loss": 1.4945214986801147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1301913559436798, + "epoch": 8.68, + "learning_rate": 6.597633136094675e-06, + "loss": 0.2717, + "step": 10269, + "task_loss": 0.3503168821334839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28418534994125366, + "epoch": 8.68, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.4259, + "step": 10270, + "task_loss": 1.119240641593933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3358052968978882, + "epoch": 8.68, + "learning_rate": 6.589180050718513e-06, + "loss": 0.3474, + "step": 10271, + "task_loss": 1.6885454654693604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2359154224395752, + "epoch": 8.68, + "learning_rate": 6.5849535080304305e-06, + "loss": 0.293, + "step": 10272, + "task_loss": 0.2343103289604187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41343820095062256, + "epoch": 8.68, + "learning_rate": 6.58072696534235e-06, + "loss": 0.4545, + "step": 10273, + "task_loss": 1.726901650428772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35470426082611084, + "epoch": 8.68, + "learning_rate": 6.576500422654269e-06, + "loss": 0.3373, + "step": 10274, + "task_loss": 0.7061622142791748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3992096483707428, + "epoch": 8.69, + "learning_rate": 6.572273879966189e-06, + "loss": 0.4136, + "step": 10275, + "task_loss": 0.9523466229438782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4088340997695923, + "epoch": 8.69, + "learning_rate": 6.568047337278106e-06, + "loss": 0.3615, + "step": 10276, + "task_loss": 0.15348869562149048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49036648869514465, + "epoch": 8.69, + "learning_rate": 6.563820794590026e-06, + "loss": 0.4048, + "step": 10277, + "task_loss": 0.8995429277420044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2457422912120819, + "epoch": 8.69, + "learning_rate": 6.559594251901945e-06, + "loss": 0.3209, + "step": 10278, + "task_loss": 0.1123863011598587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.608359694480896, + "epoch": 8.69, + "learning_rate": 6.5553677092138636e-06, + "loss": 0.3883, + "step": 10279, + "task_loss": 0.5932134985923767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3319382071495056, + "epoch": 8.69, + "learning_rate": 6.551141166525782e-06, + "loss": 0.415, + "step": 10280, + "task_loss": 1.541925311088562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40180307626724243, + "epoch": 8.69, + "learning_rate": 6.5469146238377005e-06, + "loss": 0.3604, + "step": 10281, + "task_loss": 0.6386128664016724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3587697148323059, + "epoch": 8.69, + "learning_rate": 6.54268808114962e-06, + "loss": 0.3047, + "step": 10282, + "task_loss": 0.2079063355922699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4067291021347046, + "epoch": 8.69, + "learning_rate": 6.538461538461539e-06, + "loss": 0.3765, + "step": 10283, + "task_loss": 1.3953568935394287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5291969776153564, + "epoch": 8.69, + "learning_rate": 6.534234995773457e-06, + "loss": 0.328, + "step": 10284, + "task_loss": 0.4474439024925232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29968076944351196, + "epoch": 8.69, + "learning_rate": 6.530008453085376e-06, + "loss": 0.3827, + "step": 10285, + "task_loss": 0.33003607392311096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4192046821117401, + "epoch": 8.69, + "learning_rate": 6.525781910397295e-06, + "loss": 0.3447, + "step": 10286, + "task_loss": 0.18392977118492126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5222430229187012, + "epoch": 8.7, + "learning_rate": 6.521555367709215e-06, + "loss": 0.4146, + "step": 10287, + "task_loss": 0.4148479402065277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5136749744415283, + "epoch": 8.7, + "learning_rate": 6.517328825021134e-06, + "loss": 0.4243, + "step": 10288, + "task_loss": 0.1982458084821701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2106117159128189, + "epoch": 8.7, + "learning_rate": 6.513102282333052e-06, + "loss": 0.3634, + "step": 10289, + "task_loss": 0.547339141368866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19032999873161316, + "epoch": 8.7, + "learning_rate": 6.5088757396449705e-06, + "loss": 0.3117, + "step": 10290, + "task_loss": 0.3296424448490143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4238342344760895, + "epoch": 8.7, + "learning_rate": 6.50464919695689e-06, + "loss": 0.4091, + "step": 10291, + "task_loss": 1.1447417736053467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3265095055103302, + "epoch": 8.7, + "learning_rate": 6.500422654268809e-06, + "loss": 0.3969, + "step": 10292, + "task_loss": 1.3084027767181396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2999575138092041, + "epoch": 8.7, + "learning_rate": 6.496196111580727e-06, + "loss": 0.3973, + "step": 10293, + "task_loss": 0.25465816259384155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27479398250579834, + "epoch": 8.7, + "learning_rate": 6.491969568892646e-06, + "loss": 0.6045, + "step": 10294, + "task_loss": 0.04663139581680298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33985111117362976, + "epoch": 8.7, + "learning_rate": 6.487743026204565e-06, + "loss": 0.347, + "step": 10295, + "task_loss": 0.055796485394239426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2140342742204666, + "epoch": 8.7, + "learning_rate": 6.483516483516485e-06, + "loss": 0.4385, + "step": 10296, + "task_loss": 0.30671241879463196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18891829252243042, + "epoch": 8.7, + "learning_rate": 6.479289940828402e-06, + "loss": 0.4616, + "step": 10297, + "task_loss": 0.2002723664045334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4374452829360962, + "epoch": 8.7, + "learning_rate": 6.475063398140322e-06, + "loss": 0.3894, + "step": 10298, + "task_loss": 0.6016706228256226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4171823561191559, + "epoch": 8.71, + "learning_rate": 6.4708368554522405e-06, + "loss": 0.4266, + "step": 10299, + "task_loss": 0.2055545300245285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3659719228744507, + "epoch": 8.71, + "learning_rate": 6.4666103127641594e-06, + "loss": 0.3899, + "step": 10300, + "task_loss": 0.32210463285446167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42170950770378113, + "epoch": 8.71, + "learning_rate": 6.4623837700760775e-06, + "loss": 0.46, + "step": 10301, + "task_loss": 0.3794301748275757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4294230341911316, + "epoch": 8.71, + "learning_rate": 6.458157227387996e-06, + "loss": 0.4792, + "step": 10302, + "task_loss": 0.5052094459533691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15707087516784668, + "epoch": 8.71, + "learning_rate": 6.453930684699916e-06, + "loss": 0.372, + "step": 10303, + "task_loss": 0.6095816493034363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2863939702510834, + "epoch": 8.71, + "learning_rate": 6.449704142011835e-06, + "loss": 0.3562, + "step": 10304, + "task_loss": 0.2628788948059082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8933894038200378, + "epoch": 8.71, + "learning_rate": 6.445477599323753e-06, + "loss": 0.4605, + "step": 10305, + "task_loss": 0.8263062834739685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5210189819335938, + "epoch": 8.71, + "learning_rate": 6.441251056635672e-06, + "loss": 0.4346, + "step": 10306, + "task_loss": 0.9245979189872742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31072527170181274, + "epoch": 8.71, + "learning_rate": 6.437024513947592e-06, + "loss": 0.3562, + "step": 10307, + "task_loss": 0.4490794241428375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.328745573759079, + "epoch": 8.71, + "learning_rate": 6.4327979712595106e-06, + "loss": 0.4424, + "step": 10308, + "task_loss": 0.3117794096469879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5559351444244385, + "epoch": 8.71, + "learning_rate": 6.428571428571429e-06, + "loss": 0.4462, + "step": 10309, + "task_loss": 0.3710276484489441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2957094609737396, + "epoch": 8.71, + "learning_rate": 6.4243448858833475e-06, + "loss": 0.3853, + "step": 10310, + "task_loss": 0.8715615272521973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3268918991088867, + "epoch": 8.72, + "learning_rate": 6.420118343195266e-06, + "loss": 0.3275, + "step": 10311, + "task_loss": 1.5487751960754395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23484739661216736, + "epoch": 8.72, + "learning_rate": 6.415891800507186e-06, + "loss": 0.3548, + "step": 10312, + "task_loss": 1.1772838830947876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23608151078224182, + "epoch": 8.72, + "learning_rate": 6.411665257819103e-06, + "loss": 0.3252, + "step": 10313, + "task_loss": 0.24813468754291534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6550939083099365, + "epoch": 8.72, + "learning_rate": 6.407438715131023e-06, + "loss": 0.4038, + "step": 10314, + "task_loss": 0.4283565282821655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44849255681037903, + "epoch": 8.72, + "learning_rate": 6.403212172442942e-06, + "loss": 0.3482, + "step": 10315, + "task_loss": 1.8860489130020142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.460258424282074, + "epoch": 8.72, + "learning_rate": 6.398985629754861e-06, + "loss": 0.3739, + "step": 10316, + "task_loss": 0.12443460524082184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1617927998304367, + "epoch": 8.72, + "learning_rate": 6.394759087066781e-06, + "loss": 0.3285, + "step": 10317, + "task_loss": 0.8608511686325073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48226794600486755, + "epoch": 8.72, + "learning_rate": 6.390532544378698e-06, + "loss": 0.4172, + "step": 10318, + "task_loss": 0.10933075845241547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4284980893135071, + "epoch": 8.72, + "learning_rate": 6.3863060016906175e-06, + "loss": 0.3137, + "step": 10319, + "task_loss": 0.806400716304779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3165992498397827, + "epoch": 8.72, + "learning_rate": 6.382079459002536e-06, + "loss": 0.4303, + "step": 10320, + "task_loss": 0.4945444166660309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19129790365695953, + "epoch": 8.72, + "learning_rate": 6.377852916314455e-06, + "loss": 0.4069, + "step": 10321, + "task_loss": 0.6050539016723633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25220373272895813, + "epoch": 8.72, + "learning_rate": 6.373626373626373e-06, + "loss": 0.4491, + "step": 10322, + "task_loss": 0.6629496216773987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5800293684005737, + "epoch": 8.73, + "learning_rate": 6.369399830938293e-06, + "loss": 0.4867, + "step": 10323, + "task_loss": 0.20879922807216644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2251797467470169, + "epoch": 8.73, + "learning_rate": 6.365173288250212e-06, + "loss": 0.35, + "step": 10324, + "task_loss": 0.21417711675167084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28720808029174805, + "epoch": 8.73, + "learning_rate": 6.360946745562131e-06, + "loss": 0.3782, + "step": 10325, + "task_loss": 0.2106892168521881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33009570837020874, + "epoch": 8.73, + "learning_rate": 6.356720202874049e-06, + "loss": 0.4472, + "step": 10326, + "task_loss": 0.30901291966438293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32114356756210327, + "epoch": 8.73, + "learning_rate": 6.352493660185968e-06, + "loss": 0.4017, + "step": 10327, + "task_loss": 1.3044452667236328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38691216707229614, + "epoch": 8.73, + "learning_rate": 6.3482671174978876e-06, + "loss": 0.5121, + "step": 10328, + "task_loss": 0.8568069934844971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18394160270690918, + "epoch": 8.73, + "learning_rate": 6.3440405748098064e-06, + "loss": 0.2875, + "step": 10329, + "task_loss": 0.266407310962677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3543054759502411, + "epoch": 8.73, + "learning_rate": 6.3398140321217245e-06, + "loss": 0.3458, + "step": 10330, + "task_loss": 1.2154816389083862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3551786243915558, + "epoch": 8.73, + "learning_rate": 6.335587489433643e-06, + "loss": 0.34, + "step": 10331, + "task_loss": 0.37037405371665955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13606339693069458, + "epoch": 8.73, + "learning_rate": 6.331360946745562e-06, + "loss": 0.389, + "step": 10332, + "task_loss": 0.3315262198448181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.192510724067688, + "epoch": 8.73, + "learning_rate": 6.327134404057482e-06, + "loss": 0.305, + "step": 10333, + "task_loss": 0.5704902410507202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37080585956573486, + "epoch": 8.73, + "learning_rate": 6.322907861369399e-06, + "loss": 0.444, + "step": 10334, + "task_loss": 0.12751515209674835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19843831658363342, + "epoch": 8.74, + "learning_rate": 6.318681318681319e-06, + "loss": 0.3087, + "step": 10335, + "task_loss": 0.05416838079690933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3216116428375244, + "epoch": 8.74, + "learning_rate": 6.314454775993238e-06, + "loss": 0.363, + "step": 10336, + "task_loss": 0.8008543252944946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6660699844360352, + "epoch": 8.74, + "learning_rate": 6.310228233305157e-06, + "loss": 0.5347, + "step": 10337, + "task_loss": 0.9633574485778809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37600424885749817, + "epoch": 8.74, + "learning_rate": 6.306001690617075e-06, + "loss": 0.3272, + "step": 10338, + "task_loss": 0.7214406132698059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3360324203968048, + "epoch": 8.74, + "learning_rate": 6.3017751479289945e-06, + "loss": 0.372, + "step": 10339, + "task_loss": 1.4750044345855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2529590427875519, + "epoch": 8.74, + "learning_rate": 6.297548605240913e-06, + "loss": 0.3516, + "step": 10340, + "task_loss": 0.7506409883499146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3550662398338318, + "epoch": 8.74, + "learning_rate": 6.293322062552832e-06, + "loss": 0.3156, + "step": 10341, + "task_loss": 0.5594898462295532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3803847134113312, + "epoch": 8.74, + "learning_rate": 6.28909551986475e-06, + "loss": 0.3541, + "step": 10342, + "task_loss": 0.5479049682617188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3628426492214203, + "epoch": 8.74, + "learning_rate": 6.284868977176669e-06, + "loss": 0.4476, + "step": 10343, + "task_loss": 0.8604093194007874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46164005994796753, + "epoch": 8.74, + "learning_rate": 6.280642434488589e-06, + "loss": 0.464, + "step": 10344, + "task_loss": 0.8634071350097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27743399143218994, + "epoch": 8.74, + "learning_rate": 6.276415891800508e-06, + "loss": 0.3853, + "step": 10345, + "task_loss": 0.3662146031856537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4610726833343506, + "epoch": 8.75, + "learning_rate": 6.272189349112427e-06, + "loss": 0.4525, + "step": 10346, + "task_loss": 0.4866326153278351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39394712448120117, + "epoch": 8.75, + "learning_rate": 6.267962806424345e-06, + "loss": 0.4394, + "step": 10347, + "task_loss": 0.4040084183216095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3919912874698639, + "epoch": 8.75, + "learning_rate": 6.263736263736264e-06, + "loss": 0.3861, + "step": 10348, + "task_loss": 0.6110310554504395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2109462171792984, + "epoch": 8.75, + "learning_rate": 6.2595097210481834e-06, + "loss": 0.3792, + "step": 10349, + "task_loss": 0.5261234641075134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1590849906206131, + "epoch": 8.75, + "learning_rate": 6.255283178360102e-06, + "loss": 0.3519, + "step": 10350, + "task_loss": 0.09601344168186188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3943822383880615, + "epoch": 8.75, + "learning_rate": 6.25105663567202e-06, + "loss": 0.4158, + "step": 10351, + "task_loss": 0.5499534606933594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3036785423755646, + "epoch": 8.75, + "learning_rate": 6.246830092983939e-06, + "loss": 0.3511, + "step": 10352, + "task_loss": 0.7557385563850403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3434774577617645, + "epoch": 8.75, + "learning_rate": 6.242603550295858e-06, + "loss": 0.3887, + "step": 10353, + "task_loss": 0.920377790927887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4558567702770233, + "epoch": 8.75, + "learning_rate": 6.238377007607777e-06, + "loss": 0.3364, + "step": 10354, + "task_loss": 0.7366123199462891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3226267099380493, + "epoch": 8.75, + "learning_rate": 6.234150464919696e-06, + "loss": 0.3676, + "step": 10355, + "task_loss": 0.41210582852363586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24346661567687988, + "epoch": 8.75, + "learning_rate": 6.229923922231615e-06, + "loss": 0.4562, + "step": 10356, + "task_loss": 0.48821502923965454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2848057746887207, + "epoch": 8.75, + "learning_rate": 6.225697379543534e-06, + "loss": 0.3695, + "step": 10357, + "task_loss": 0.4714357852935791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.163316547870636, + "epoch": 8.76, + "learning_rate": 6.221470836855453e-06, + "loss": 0.2855, + "step": 10358, + "task_loss": 0.11252571642398834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3133004307746887, + "epoch": 8.76, + "learning_rate": 6.2172442941673715e-06, + "loss": 0.3785, + "step": 10359, + "task_loss": 0.7690437436103821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5922303199768066, + "epoch": 8.76, + "learning_rate": 6.21301775147929e-06, + "loss": 0.3852, + "step": 10360, + "task_loss": 0.6204506754875183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2429821938276291, + "epoch": 8.76, + "learning_rate": 6.208791208791209e-06, + "loss": 0.4169, + "step": 10361, + "task_loss": 0.4288635551929474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7432807683944702, + "epoch": 8.76, + "learning_rate": 6.204564666103127e-06, + "loss": 0.5201, + "step": 10362, + "task_loss": 1.7511721849441528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35916954278945923, + "epoch": 8.76, + "learning_rate": 6.200338123415047e-06, + "loss": 0.3256, + "step": 10363, + "task_loss": 0.1071631908416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2498967945575714, + "epoch": 8.76, + "learning_rate": 6.196111580726965e-06, + "loss": 0.3135, + "step": 10364, + "task_loss": 0.4970898926258087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2621099352836609, + "epoch": 8.76, + "learning_rate": 6.191885038038885e-06, + "loss": 0.4896, + "step": 10365, + "task_loss": 0.5285915732383728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2422584593296051, + "epoch": 8.76, + "learning_rate": 6.187658495350803e-06, + "loss": 0.2685, + "step": 10366, + "task_loss": 0.34828007221221924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2433885931968689, + "epoch": 8.76, + "learning_rate": 6.183431952662723e-06, + "loss": 0.2831, + "step": 10367, + "task_loss": 0.18364499509334564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23117919266223907, + "epoch": 8.76, + "learning_rate": 6.179205409974641e-06, + "loss": 0.404, + "step": 10368, + "task_loss": 0.15573401749134064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3647945821285248, + "epoch": 8.76, + "learning_rate": 6.1749788672865596e-06, + "loss": 0.4435, + "step": 10369, + "task_loss": 0.9030464887619019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3593463897705078, + "epoch": 8.77, + "learning_rate": 6.1707523245984785e-06, + "loss": 0.2983, + "step": 10370, + "task_loss": 0.36978694796562195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21983423829078674, + "epoch": 8.77, + "learning_rate": 6.166525781910397e-06, + "loss": 0.4046, + "step": 10371, + "task_loss": 0.43657055497169495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22845976054668427, + "epoch": 8.77, + "learning_rate": 6.162299239222316e-06, + "loss": 0.3491, + "step": 10372, + "task_loss": 0.0687982365489006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5453740358352661, + "epoch": 8.77, + "learning_rate": 6.158072696534235e-06, + "loss": 0.3852, + "step": 10373, + "task_loss": 0.9454341530799866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21767321228981018, + "epoch": 8.77, + "learning_rate": 6.153846153846155e-06, + "loss": 0.3012, + "step": 10374, + "task_loss": 0.17494787275791168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49846434593200684, + "epoch": 8.77, + "learning_rate": 6.149619611158073e-06, + "loss": 0.4199, + "step": 10375, + "task_loss": 0.2116880714893341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42876359820365906, + "epoch": 8.77, + "learning_rate": 6.145393068469992e-06, + "loss": 0.3657, + "step": 10376, + "task_loss": 0.49407392740249634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31646621227264404, + "epoch": 8.77, + "learning_rate": 6.141166525781911e-06, + "loss": 0.3802, + "step": 10377, + "task_loss": 0.40297001600265503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2238287329673767, + "epoch": 8.77, + "learning_rate": 6.13693998309383e-06, + "loss": 0.3519, + "step": 10378, + "task_loss": 0.6684016585350037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3505731225013733, + "epoch": 8.77, + "learning_rate": 6.1327134404057485e-06, + "loss": 0.4345, + "step": 10379, + "task_loss": 0.5631243586540222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.349264532327652, + "epoch": 8.77, + "learning_rate": 6.128486897717667e-06, + "loss": 0.3689, + "step": 10380, + "task_loss": 0.9389679431915283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7556191682815552, + "epoch": 8.77, + "learning_rate": 6.124260355029586e-06, + "loss": 0.3926, + "step": 10381, + "task_loss": 0.11405347287654877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28499191999435425, + "epoch": 8.78, + "learning_rate": 6.120033812341505e-06, + "loss": 0.2374, + "step": 10382, + "task_loss": 0.32449033856391907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31979113817214966, + "epoch": 8.78, + "learning_rate": 6.115807269653424e-06, + "loss": 0.2905, + "step": 10383, + "task_loss": 0.44297459721565247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23574215173721313, + "epoch": 8.78, + "learning_rate": 6.111580726965343e-06, + "loss": 0.3739, + "step": 10384, + "task_loss": 0.4286256730556488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.10998722910881042, + "epoch": 8.78, + "learning_rate": 6.107354184277261e-06, + "loss": 0.3008, + "step": 10385, + "task_loss": 0.13414372503757477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3486727476119995, + "epoch": 8.78, + "learning_rate": 6.103127641589181e-06, + "loss": 0.4329, + "step": 10386, + "task_loss": 0.5354213714599609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3342595398426056, + "epoch": 8.78, + "learning_rate": 6.098901098901099e-06, + "loss": 0.4216, + "step": 10387, + "task_loss": 0.8670530319213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21947066485881805, + "epoch": 8.78, + "learning_rate": 6.0946745562130185e-06, + "loss": 0.3637, + "step": 10388, + "task_loss": 0.045155420899391174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2350551187992096, + "epoch": 8.78, + "learning_rate": 6.0904480135249366e-06, + "loss": 0.4289, + "step": 10389, + "task_loss": 0.4281821846961975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2293834537267685, + "epoch": 8.78, + "learning_rate": 6.086221470836856e-06, + "loss": 0.4152, + "step": 10390, + "task_loss": 0.5405852198600769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.315999835729599, + "epoch": 8.78, + "learning_rate": 6.081994928148774e-06, + "loss": 0.4184, + "step": 10391, + "task_loss": 0.5679742693901062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.483694463968277, + "epoch": 8.78, + "learning_rate": 6.077768385460693e-06, + "loss": 0.3786, + "step": 10392, + "task_loss": 0.7007626295089722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3256436586380005, + "epoch": 8.78, + "learning_rate": 6.073541842772612e-06, + "loss": 0.4161, + "step": 10393, + "task_loss": 0.24970579147338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4905927777290344, + "epoch": 8.79, + "learning_rate": 6.069315300084531e-06, + "loss": 0.3822, + "step": 10394, + "task_loss": 0.5679907202720642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2644203305244446, + "epoch": 8.79, + "learning_rate": 6.06508875739645e-06, + "loss": 0.284, + "step": 10395, + "task_loss": 0.6836908459663391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2712416648864746, + "epoch": 8.79, + "learning_rate": 6.060862214708369e-06, + "loss": 0.263, + "step": 10396, + "task_loss": 0.4341779053211212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2337079793214798, + "epoch": 8.79, + "learning_rate": 6.056635672020288e-06, + "loss": 0.4999, + "step": 10397, + "task_loss": 0.25437474250793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3711598515510559, + "epoch": 8.79, + "learning_rate": 6.0524091293322066e-06, + "loss": 0.3757, + "step": 10398, + "task_loss": 0.3741176426410675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24600717425346375, + "epoch": 8.79, + "learning_rate": 6.0481825866441255e-06, + "loss": 0.3009, + "step": 10399, + "task_loss": 0.30469605326652527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2980375587940216, + "epoch": 8.79, + "learning_rate": 6.043956043956044e-06, + "loss": 0.2578, + "step": 10400, + "task_loss": 0.5250120162963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2616725265979767, + "epoch": 8.79, + "learning_rate": 6.039729501267962e-06, + "loss": 0.3155, + "step": 10401, + "task_loss": 0.618898332118988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43085891008377075, + "epoch": 8.79, + "learning_rate": 6.035502958579882e-06, + "loss": 0.3464, + "step": 10402, + "task_loss": 0.5121468305587769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3438425660133362, + "epoch": 8.79, + "learning_rate": 6.0312764158918e-06, + "loss": 0.3369, + "step": 10403, + "task_loss": 1.300254225730896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.411079466342926, + "epoch": 8.79, + "learning_rate": 6.02704987320372e-06, + "loss": 0.3596, + "step": 10404, + "task_loss": 0.8620886206626892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4768844246864319, + "epoch": 8.79, + "learning_rate": 6.022823330515639e-06, + "loss": 0.4165, + "step": 10405, + "task_loss": 0.711778461933136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31298163533210754, + "epoch": 8.8, + "learning_rate": 6.018596787827558e-06, + "loss": 0.3742, + "step": 10406, + "task_loss": 1.1239935159683228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33938372135162354, + "epoch": 8.8, + "learning_rate": 6.014370245139477e-06, + "loss": 0.3388, + "step": 10407, + "task_loss": 0.645356297492981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17186978459358215, + "epoch": 8.8, + "learning_rate": 6.010143702451395e-06, + "loss": 0.2685, + "step": 10408, + "task_loss": 0.03959939628839493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4139705002307892, + "epoch": 8.8, + "learning_rate": 6.005917159763314e-06, + "loss": 0.3043, + "step": 10409, + "task_loss": 0.6088593602180481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3759179711341858, + "epoch": 8.8, + "learning_rate": 6.0016906170752324e-06, + "loss": 0.3491, + "step": 10410, + "task_loss": 0.13358567655086517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5208841562271118, + "epoch": 8.8, + "learning_rate": 5.997464074387152e-06, + "loss": 0.4006, + "step": 10411, + "task_loss": 0.7816829085350037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38995715975761414, + "epoch": 8.8, + "learning_rate": 5.99323753169907e-06, + "loss": 0.436, + "step": 10412, + "task_loss": 2.3727638721466064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24432049691677094, + "epoch": 8.8, + "learning_rate": 5.98901098901099e-06, + "loss": 0.4313, + "step": 10413, + "task_loss": 1.333655834197998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27580273151397705, + "epoch": 8.8, + "learning_rate": 5.984784446322908e-06, + "loss": 0.294, + "step": 10414, + "task_loss": 0.677986741065979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29705536365509033, + "epoch": 8.8, + "learning_rate": 5.980557903634827e-06, + "loss": 0.404, + "step": 10415, + "task_loss": 0.39004799723625183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2986558675765991, + "epoch": 8.8, + "learning_rate": 5.976331360946746e-06, + "loss": 0.2969, + "step": 10416, + "task_loss": 0.14410589635372162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5597624778747559, + "epoch": 8.81, + "learning_rate": 5.972104818258665e-06, + "loss": 0.4689, + "step": 10417, + "task_loss": 1.8824102878570557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.406081885099411, + "epoch": 8.81, + "learning_rate": 5.9678782755705836e-06, + "loss": 0.4155, + "step": 10418, + "task_loss": 0.66753089427948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.233069509267807, + "epoch": 8.81, + "learning_rate": 5.9636517328825025e-06, + "loss": 0.2942, + "step": 10419, + "task_loss": 0.2811150550842285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5123084783554077, + "epoch": 8.81, + "learning_rate": 5.959425190194421e-06, + "loss": 0.4248, + "step": 10420, + "task_loss": 0.651820957660675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5521732568740845, + "epoch": 8.81, + "learning_rate": 5.95519864750634e-06, + "loss": 0.4134, + "step": 10421, + "task_loss": 1.5152124166488647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17878499627113342, + "epoch": 8.81, + "learning_rate": 5.950972104818259e-06, + "loss": 0.3302, + "step": 10422, + "task_loss": 0.19556304812431335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16046608984470367, + "epoch": 8.81, + "learning_rate": 5.946745562130178e-06, + "loss": 0.3065, + "step": 10423, + "task_loss": 1.0488719940185547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3365938067436218, + "epoch": 8.81, + "learning_rate": 5.942519019442096e-06, + "loss": 0.3126, + "step": 10424, + "task_loss": 0.34499451518058777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22020339965820312, + "epoch": 8.81, + "learning_rate": 5.938292476754016e-06, + "loss": 0.3793, + "step": 10425, + "task_loss": 0.5699753761291504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.11950044333934784, + "epoch": 8.81, + "learning_rate": 5.934065934065934e-06, + "loss": 0.4298, + "step": 10426, + "task_loss": 0.35527607798576355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42416703701019287, + "epoch": 8.81, + "learning_rate": 5.929839391377854e-06, + "loss": 0.4323, + "step": 10427, + "task_loss": 0.6153509020805359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20860655605793, + "epoch": 8.81, + "learning_rate": 5.925612848689772e-06, + "loss": 0.2693, + "step": 10428, + "task_loss": 0.5964874625205994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3691558539867401, + "epoch": 8.82, + "learning_rate": 5.921386306001691e-06, + "loss": 0.2642, + "step": 10429, + "task_loss": 0.5073607563972473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42879894375801086, + "epoch": 8.82, + "learning_rate": 5.917159763313609e-06, + "loss": 0.3232, + "step": 10430, + "task_loss": 0.5207659006118774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49050623178482056, + "epoch": 8.82, + "learning_rate": 5.912933220625528e-06, + "loss": 0.3654, + "step": 10431, + "task_loss": 1.8078371286392212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4021673798561096, + "epoch": 8.82, + "learning_rate": 5.908706677937447e-06, + "loss": 0.3943, + "step": 10432, + "task_loss": 0.8762192130088806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33206841349601746, + "epoch": 8.82, + "learning_rate": 5.904480135249366e-06, + "loss": 0.4188, + "step": 10433, + "task_loss": 0.0793519839644432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20196223258972168, + "epoch": 8.82, + "learning_rate": 5.900253592561286e-06, + "loss": 0.2542, + "step": 10434, + "task_loss": 0.05522362142801285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38283276557922363, + "epoch": 8.82, + "learning_rate": 5.896027049873204e-06, + "loss": 0.3751, + "step": 10435, + "task_loss": 1.4365514516830444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2238149642944336, + "epoch": 8.82, + "learning_rate": 5.891800507185123e-06, + "loss": 0.3915, + "step": 10436, + "task_loss": 0.47790297865867615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4081907272338867, + "epoch": 8.82, + "learning_rate": 5.887573964497042e-06, + "loss": 0.4061, + "step": 10437, + "task_loss": 0.8007245063781738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39941054582595825, + "epoch": 8.82, + "learning_rate": 5.8833474218089605e-06, + "loss": 0.3278, + "step": 10438, + "task_loss": 0.9977150559425354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37820160388946533, + "epoch": 8.82, + "learning_rate": 5.8791208791208794e-06, + "loss": 0.3712, + "step": 10439, + "task_loss": 0.6372966766357422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5911603569984436, + "epoch": 8.82, + "learning_rate": 5.874894336432798e-06, + "loss": 0.4269, + "step": 10440, + "task_loss": 0.402508407831192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28881561756134033, + "epoch": 8.83, + "learning_rate": 5.870667793744717e-06, + "loss": 0.3482, + "step": 10441, + "task_loss": 0.2989804446697235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4393112063407898, + "epoch": 8.83, + "learning_rate": 5.866441251056636e-06, + "loss": 0.384, + "step": 10442, + "task_loss": 0.7820533514022827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38091349601745605, + "epoch": 8.83, + "learning_rate": 5.862214708368555e-06, + "loss": 0.2699, + "step": 10443, + "task_loss": 1.0663855075836182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26595228910446167, + "epoch": 8.83, + "learning_rate": 5.857988165680474e-06, + "loss": 0.3555, + "step": 10444, + "task_loss": 0.20775507390499115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47235286235809326, + "epoch": 8.83, + "learning_rate": 5.853761622992393e-06, + "loss": 0.4237, + "step": 10445, + "task_loss": 0.3829287588596344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28024032711982727, + "epoch": 8.83, + "learning_rate": 5.849535080304312e-06, + "loss": 0.2971, + "step": 10446, + "task_loss": 0.5501790046691895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3127982020378113, + "epoch": 8.83, + "learning_rate": 5.84530853761623e-06, + "loss": 0.3462, + "step": 10447, + "task_loss": 0.5126351714134216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5277485847473145, + "epoch": 8.83, + "learning_rate": 5.8410819949281495e-06, + "loss": 0.3637, + "step": 10448, + "task_loss": 0.5456387400627136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2662937641143799, + "epoch": 8.83, + "learning_rate": 5.8368554522400675e-06, + "loss": 0.3779, + "step": 10449, + "task_loss": 1.1496986150741577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38354212045669556, + "epoch": 8.83, + "learning_rate": 5.832628909551987e-06, + "loss": 0.4749, + "step": 10450, + "task_loss": 1.5018688440322876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.477462500333786, + "epoch": 8.83, + "learning_rate": 5.828402366863905e-06, + "loss": 0.5103, + "step": 10451, + "task_loss": 0.9591981768608093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31725820899009705, + "epoch": 8.83, + "learning_rate": 5.824175824175824e-06, + "loss": 0.3904, + "step": 10452, + "task_loss": 0.23489266633987427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47911036014556885, + "epoch": 8.84, + "learning_rate": 5.819949281487743e-06, + "loss": 0.369, + "step": 10453, + "task_loss": 0.6942949295043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38454657793045044, + "epoch": 8.84, + "learning_rate": 5.815722738799662e-06, + "loss": 0.3301, + "step": 10454, + "task_loss": 0.3719600737094879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.510131299495697, + "epoch": 8.84, + "learning_rate": 5.811496196111581e-06, + "loss": 0.3182, + "step": 10455, + "task_loss": 0.7374346256256104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28108108043670654, + "epoch": 8.84, + "learning_rate": 5.8072696534235e-06, + "loss": 0.4316, + "step": 10456, + "task_loss": 0.5342756509780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18161022663116455, + "epoch": 8.84, + "learning_rate": 5.803043110735419e-06, + "loss": 0.2911, + "step": 10457, + "task_loss": 0.6893314123153687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45669710636138916, + "epoch": 8.84, + "learning_rate": 5.7988165680473375e-06, + "loss": 0.367, + "step": 10458, + "task_loss": 0.7740815281867981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20296022295951843, + "epoch": 8.84, + "learning_rate": 5.794590025359256e-06, + "loss": 0.3131, + "step": 10459, + "task_loss": 0.3846844434738159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40013182163238525, + "epoch": 8.84, + "learning_rate": 5.790363482671175e-06, + "loss": 0.4496, + "step": 10460, + "task_loss": 1.3673583269119263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20002758502960205, + "epoch": 8.84, + "learning_rate": 5.786136939983094e-06, + "loss": 0.2579, + "step": 10461, + "task_loss": 0.4183551073074341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3325389325618744, + "epoch": 8.84, + "learning_rate": 5.781910397295013e-06, + "loss": 0.3193, + "step": 10462, + "task_loss": 0.5600717067718506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3233494758605957, + "epoch": 8.84, + "learning_rate": 5.777683854606932e-06, + "loss": 0.3674, + "step": 10463, + "task_loss": 0.5678949356079102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3349851965904236, + "epoch": 8.84, + "learning_rate": 5.773457311918851e-06, + "loss": 0.4056, + "step": 10464, + "task_loss": 1.1104687452316284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6231229305267334, + "epoch": 8.85, + "learning_rate": 5.76923076923077e-06, + "loss": 0.3732, + "step": 10465, + "task_loss": 0.6410720348358154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24400070309638977, + "epoch": 8.85, + "learning_rate": 5.765004226542689e-06, + "loss": 0.3433, + "step": 10466, + "task_loss": 0.19105969369411469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38292351365089417, + "epoch": 8.85, + "learning_rate": 5.7607776838546076e-06, + "loss": 0.3699, + "step": 10467, + "task_loss": 1.0845474004745483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39904534816741943, + "epoch": 8.85, + "learning_rate": 5.756551141166526e-06, + "loss": 0.4193, + "step": 10468, + "task_loss": 0.7654553651809692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4003687798976898, + "epoch": 8.85, + "learning_rate": 5.752324598478445e-06, + "loss": 0.3593, + "step": 10469, + "task_loss": 0.22297053039073944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3450118601322174, + "epoch": 8.85, + "learning_rate": 5.748098055790363e-06, + "loss": 0.3391, + "step": 10470, + "task_loss": 0.7097237706184387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4231759309768677, + "epoch": 8.85, + "learning_rate": 5.743871513102283e-06, + "loss": 0.3647, + "step": 10471, + "task_loss": 0.28444379568099976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2966248691082001, + "epoch": 8.85, + "learning_rate": 5.739644970414201e-06, + "loss": 0.366, + "step": 10472, + "task_loss": 0.8217769265174866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.433655321598053, + "epoch": 8.85, + "learning_rate": 5.735418427726121e-06, + "loss": 0.3979, + "step": 10473, + "task_loss": 0.5183531045913696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35510674118995667, + "epoch": 8.85, + "learning_rate": 5.731191885038039e-06, + "loss": 0.3705, + "step": 10474, + "task_loss": 0.24792934954166412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23211008310317993, + "epoch": 8.85, + "learning_rate": 5.726965342349958e-06, + "loss": 0.2957, + "step": 10475, + "task_loss": 0.259744256734848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2525983154773712, + "epoch": 8.85, + "learning_rate": 5.722738799661877e-06, + "loss": 0.3111, + "step": 10476, + "task_loss": 0.5207905769348145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38655757904052734, + "epoch": 8.86, + "learning_rate": 5.718512256973796e-06, + "loss": 0.317, + "step": 10477, + "task_loss": 0.3512536287307739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2320643812417984, + "epoch": 8.86, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3828, + "step": 10478, + "task_loss": 0.30625253915786743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42895179986953735, + "epoch": 8.86, + "learning_rate": 5.710059171597633e-06, + "loss": 0.3294, + "step": 10479, + "task_loss": 0.753816545009613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35077714920043945, + "epoch": 8.86, + "learning_rate": 5.705832628909552e-06, + "loss": 0.3315, + "step": 10480, + "task_loss": 0.14668454229831696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4610116481781006, + "epoch": 8.86, + "learning_rate": 5.701606086221471e-06, + "loss": 0.3776, + "step": 10481, + "task_loss": 1.0376172065734863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45168840885162354, + "epoch": 8.86, + "learning_rate": 5.69737954353339e-06, + "loss": 0.5557, + "step": 10482, + "task_loss": 0.5488502979278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5054806470870972, + "epoch": 8.86, + "learning_rate": 5.693153000845309e-06, + "loss": 0.3596, + "step": 10483, + "task_loss": 0.2914064824581146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45483261346817017, + "epoch": 8.86, + "learning_rate": 5.688926458157227e-06, + "loss": 0.3552, + "step": 10484, + "task_loss": 0.3156895041465759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38567960262298584, + "epoch": 8.86, + "learning_rate": 5.684699915469147e-06, + "loss": 0.3767, + "step": 10485, + "task_loss": 0.5318587422370911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21512821316719055, + "epoch": 8.86, + "learning_rate": 5.680473372781065e-06, + "loss": 0.4602, + "step": 10486, + "task_loss": 0.7077475786209106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3648052513599396, + "epoch": 8.86, + "learning_rate": 5.6762468300929845e-06, + "loss": 0.3934, + "step": 10487, + "task_loss": 0.9601614475250244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2327917516231537, + "epoch": 8.87, + "learning_rate": 5.672020287404903e-06, + "loss": 0.3451, + "step": 10488, + "task_loss": 0.7424845695495605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41599369049072266, + "epoch": 8.87, + "learning_rate": 5.667793744716822e-06, + "loss": 0.3154, + "step": 10489, + "task_loss": 0.4143931567668915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5976848006248474, + "epoch": 8.87, + "learning_rate": 5.66356720202874e-06, + "loss": 0.4024, + "step": 10490, + "task_loss": 0.6936287879943848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4617854058742523, + "epoch": 8.87, + "learning_rate": 5.659340659340659e-06, + "loss": 0.3533, + "step": 10491, + "task_loss": 0.5180593729019165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45112961530685425, + "epoch": 8.87, + "learning_rate": 5.655114116652578e-06, + "loss": 0.411, + "step": 10492, + "task_loss": 0.8355237245559692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2177576720714569, + "epoch": 8.87, + "learning_rate": 5.650887573964497e-06, + "loss": 0.2739, + "step": 10493, + "task_loss": 0.5394307374954224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4003145694732666, + "epoch": 8.87, + "learning_rate": 5.646661031276417e-06, + "loss": 0.4188, + "step": 10494, + "task_loss": 1.2934889793395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4112706184387207, + "epoch": 8.87, + "learning_rate": 5.642434488588335e-06, + "loss": 0.4058, + "step": 10495, + "task_loss": 1.1841808557510376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43930792808532715, + "epoch": 8.87, + "learning_rate": 5.6382079459002546e-06, + "loss": 0.5096, + "step": 10496, + "task_loss": 1.053077220916748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3699222207069397, + "epoch": 8.87, + "learning_rate": 5.633981403212173e-06, + "loss": 0.3921, + "step": 10497, + "task_loss": 0.1674131602048874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4526301622390747, + "epoch": 8.87, + "learning_rate": 5.6297548605240915e-06, + "loss": 0.4165, + "step": 10498, + "task_loss": 0.49757182598114014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31889957189559937, + "epoch": 8.87, + "learning_rate": 5.62552831783601e-06, + "loss": 0.4172, + "step": 10499, + "task_loss": 0.272235244512558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38047248125076294, + "epoch": 8.88, + "learning_rate": 5.621301775147929e-06, + "loss": 0.3946, + "step": 10500, + "task_loss": 1.6199826002120972 + }, + { + "epoch": 8.88, + "eval_accuracy": 0.9176237623762377, + "eval_loss": 0.24891653656959534, + "eval_runtime": 225.3499, + "eval_samples_per_second": 112.048, + "eval_steps_per_second": 0.879, + "step": 10500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3256850242614746, + "epoch": 8.88, + "learning_rate": 5.617075232459848e-06, + "loss": 0.3579, + "step": 10501, + "task_loss": 0.3629542291164398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34127897024154663, + "epoch": 8.88, + "learning_rate": 5.612848689771767e-06, + "loss": 0.346, + "step": 10502, + "task_loss": 0.5949999690055847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33588463068008423, + "epoch": 8.88, + "learning_rate": 5.608622147083686e-06, + "loss": 0.429, + "step": 10503, + "task_loss": 0.6808600425720215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43158018589019775, + "epoch": 8.88, + "learning_rate": 5.604395604395605e-06, + "loss": 0.3416, + "step": 10504, + "task_loss": 0.5876849889755249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2707611322402954, + "epoch": 8.88, + "learning_rate": 5.600169061707524e-06, + "loss": 0.3085, + "step": 10505, + "task_loss": 0.16990092396736145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25542157888412476, + "epoch": 8.88, + "learning_rate": 5.595942519019443e-06, + "loss": 0.2872, + "step": 10506, + "task_loss": 0.294696182012558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.327543169260025, + "epoch": 8.88, + "learning_rate": 5.591715976331361e-06, + "loss": 0.4159, + "step": 10507, + "task_loss": 0.454831600189209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3070724308490753, + "epoch": 8.88, + "learning_rate": 5.58748943364328e-06, + "loss": 0.4121, + "step": 10508, + "task_loss": 0.624697744846344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2310318797826767, + "epoch": 8.88, + "learning_rate": 5.5832628909551985e-06, + "loss": 0.3904, + "step": 10509, + "task_loss": 0.31720322370529175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2536173164844513, + "epoch": 8.88, + "learning_rate": 5.579036348267118e-06, + "loss": 0.2513, + "step": 10510, + "task_loss": 0.24081407487392426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4361970126628876, + "epoch": 8.88, + "learning_rate": 5.574809805579036e-06, + "loss": 0.4089, + "step": 10511, + "task_loss": 0.7810481190681458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32039326429367065, + "epoch": 8.89, + "learning_rate": 5.570583262890956e-06, + "loss": 0.2511, + "step": 10512, + "task_loss": 0.2551576793193817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48474329710006714, + "epoch": 8.89, + "learning_rate": 5.566356720202874e-06, + "loss": 0.3552, + "step": 10513, + "task_loss": 0.8193480372428894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1662999540567398, + "epoch": 8.89, + "learning_rate": 5.562130177514793e-06, + "loss": 0.3201, + "step": 10514, + "task_loss": 0.45684829354286194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30625665187835693, + "epoch": 8.89, + "learning_rate": 5.557903634826712e-06, + "loss": 0.4084, + "step": 10515, + "task_loss": 0.9072417616844177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5004543662071228, + "epoch": 8.89, + "learning_rate": 5.553677092138631e-06, + "loss": 0.3983, + "step": 10516, + "task_loss": 0.5443535447120667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2521968185901642, + "epoch": 8.89, + "learning_rate": 5.54945054945055e-06, + "loss": 0.2959, + "step": 10517, + "task_loss": 0.8739181756973267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34476685523986816, + "epoch": 8.89, + "learning_rate": 5.5452240067624685e-06, + "loss": 0.3635, + "step": 10518, + "task_loss": 0.6010934114456177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3205660581588745, + "epoch": 8.89, + "learning_rate": 5.540997464074387e-06, + "loss": 0.5046, + "step": 10519, + "task_loss": 0.2961041033267975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5317286252975464, + "epoch": 8.89, + "learning_rate": 5.536770921386306e-06, + "loss": 0.3987, + "step": 10520, + "task_loss": 1.2092161178588867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32604026794433594, + "epoch": 8.89, + "learning_rate": 5.532544378698225e-06, + "loss": 0.4587, + "step": 10521, + "task_loss": 0.9963746666908264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6662105321884155, + "epoch": 8.89, + "learning_rate": 5.528317836010144e-06, + "loss": 0.3808, + "step": 10522, + "task_loss": 0.844237744808197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22383232414722443, + "epoch": 8.89, + "learning_rate": 5.524091293322063e-06, + "loss": 0.3186, + "step": 10523, + "task_loss": 0.5449993014335632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32099008560180664, + "epoch": 8.9, + "learning_rate": 5.519864750633982e-06, + "loss": 0.376, + "step": 10524, + "task_loss": 0.07917316257953644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32138481736183167, + "epoch": 8.9, + "learning_rate": 5.515638207945901e-06, + "loss": 0.375, + "step": 10525, + "task_loss": 0.0775611475110054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27948567271232605, + "epoch": 8.9, + "learning_rate": 5.51141166525782e-06, + "loss": 0.3175, + "step": 10526, + "task_loss": 0.47832366824150085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2130252569913864, + "epoch": 8.9, + "learning_rate": 5.5071851225697385e-06, + "loss": 0.3276, + "step": 10527, + "task_loss": 0.43031612038612366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21758508682250977, + "epoch": 8.9, + "learning_rate": 5.502958579881657e-06, + "loss": 0.335, + "step": 10528, + "task_loss": 0.815727174282074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32555532455444336, + "epoch": 8.9, + "learning_rate": 5.498732037193576e-06, + "loss": 0.3433, + "step": 10529, + "task_loss": 0.4198053777217865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.412402868270874, + "epoch": 8.9, + "learning_rate": 5.494505494505494e-06, + "loss": 0.3176, + "step": 10530, + "task_loss": 0.7705665230751038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38290080428123474, + "epoch": 8.9, + "learning_rate": 5.490278951817414e-06, + "loss": 0.364, + "step": 10531, + "task_loss": 0.8077495694160461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3473605513572693, + "epoch": 8.9, + "learning_rate": 5.486052409129332e-06, + "loss": 0.3727, + "step": 10532, + "task_loss": 0.5337679386138916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40602636337280273, + "epoch": 8.9, + "learning_rate": 5.481825866441252e-06, + "loss": 0.4247, + "step": 10533, + "task_loss": 1.3812623023986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26739975810050964, + "epoch": 8.9, + "learning_rate": 5.47759932375317e-06, + "loss": 0.3322, + "step": 10534, + "task_loss": 1.300783395767212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3053983449935913, + "epoch": 8.9, + "learning_rate": 5.47337278106509e-06, + "loss": 0.3786, + "step": 10535, + "task_loss": 1.3127219676971436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2780449390411377, + "epoch": 8.91, + "learning_rate": 5.469146238377008e-06, + "loss": 0.3548, + "step": 10536, + "task_loss": 0.18683917820453644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17003920674324036, + "epoch": 8.91, + "learning_rate": 5.4649196956889266e-06, + "loss": 0.3385, + "step": 10537, + "task_loss": 0.190005823969841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30373498797416687, + "epoch": 8.91, + "learning_rate": 5.4606931530008455e-06, + "loss": 0.3574, + "step": 10538, + "task_loss": 1.3013101816177368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2664998173713684, + "epoch": 8.91, + "learning_rate": 5.456466610312764e-06, + "loss": 0.4277, + "step": 10539, + "task_loss": 0.08448263257741928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40612179040908813, + "epoch": 8.91, + "learning_rate": 5.452240067624683e-06, + "loss": 0.3665, + "step": 10540, + "task_loss": 1.2882914543151855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6719151139259338, + "epoch": 8.91, + "learning_rate": 5.448013524936602e-06, + "loss": 0.4749, + "step": 10541, + "task_loss": 0.48421168327331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27277275919914246, + "epoch": 8.91, + "learning_rate": 5.443786982248521e-06, + "loss": 0.3045, + "step": 10542, + "task_loss": 0.15740960836410522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3185015618801117, + "epoch": 8.91, + "learning_rate": 5.43956043956044e-06, + "loss": 0.3226, + "step": 10543, + "task_loss": 0.7993637323379517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13551494479179382, + "epoch": 8.91, + "learning_rate": 5.435333896872359e-06, + "loss": 0.3003, + "step": 10544, + "task_loss": 0.11653448641300201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48348331451416016, + "epoch": 8.91, + "learning_rate": 5.431107354184278e-06, + "loss": 0.378, + "step": 10545, + "task_loss": 0.6741025447845459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4348740577697754, + "epoch": 8.91, + "learning_rate": 5.426880811496196e-06, + "loss": 0.4626, + "step": 10546, + "task_loss": 0.6881992220878601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2195262908935547, + "epoch": 8.91, + "learning_rate": 5.4226542688081155e-06, + "loss": 0.3281, + "step": 10547, + "task_loss": 0.9424790740013123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34371060132980347, + "epoch": 8.92, + "learning_rate": 5.4184277261200335e-06, + "loss": 0.3617, + "step": 10548, + "task_loss": 0.8610680103302002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3726586103439331, + "epoch": 8.92, + "learning_rate": 5.414201183431953e-06, + "loss": 0.3438, + "step": 10549, + "task_loss": 0.08373191952705383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26128527522087097, + "epoch": 8.92, + "learning_rate": 5.409974640743871e-06, + "loss": 0.3832, + "step": 10550, + "task_loss": 0.42539262771606445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29745981097221375, + "epoch": 8.92, + "learning_rate": 5.405748098055791e-06, + "loss": 0.3173, + "step": 10551, + "task_loss": 0.6949764490127563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3863251209259033, + "epoch": 8.92, + "learning_rate": 5.40152155536771e-06, + "loss": 0.3332, + "step": 10552, + "task_loss": 0.2945582866668701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3376665413379669, + "epoch": 8.92, + "learning_rate": 5.397295012679628e-06, + "loss": 0.3781, + "step": 10553, + "task_loss": 0.2574164867401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29187387228012085, + "epoch": 8.92, + "learning_rate": 5.393068469991548e-06, + "loss": 0.4159, + "step": 10554, + "task_loss": 0.39668914675712585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43753212690353394, + "epoch": 8.92, + "learning_rate": 5.388841927303466e-06, + "loss": 0.4158, + "step": 10555, + "task_loss": 0.6210050582885742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3267529606819153, + "epoch": 8.92, + "learning_rate": 5.3846153846153855e-06, + "loss": 0.3835, + "step": 10556, + "task_loss": 0.8200408220291138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2754681706428528, + "epoch": 8.92, + "learning_rate": 5.3803888419273036e-06, + "loss": 0.4152, + "step": 10557, + "task_loss": 0.48925378918647766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22396092116832733, + "epoch": 8.92, + "learning_rate": 5.3761622992392224e-06, + "loss": 0.3139, + "step": 10558, + "task_loss": 0.5447292327880859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3004778027534485, + "epoch": 8.93, + "learning_rate": 5.371935756551141e-06, + "loss": 0.4364, + "step": 10559, + "task_loss": 0.8373518586158752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3843916356563568, + "epoch": 8.93, + "learning_rate": 5.36770921386306e-06, + "loss": 0.361, + "step": 10560, + "task_loss": 1.3160759210586548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38844168186187744, + "epoch": 8.93, + "learning_rate": 5.363482671174979e-06, + "loss": 0.3472, + "step": 10561, + "task_loss": 0.3519066870212555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2851048707962036, + "epoch": 8.93, + "learning_rate": 5.359256128486898e-06, + "loss": 0.2805, + "step": 10562, + "task_loss": 0.27651041746139526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24855992197990417, + "epoch": 8.93, + "learning_rate": 5.355029585798817e-06, + "loss": 0.2873, + "step": 10563, + "task_loss": 0.3810814917087555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4621483087539673, + "epoch": 8.93, + "learning_rate": 5.350803043110736e-06, + "loss": 0.3358, + "step": 10564, + "task_loss": 0.6374263167381287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33553358912467957, + "epoch": 8.93, + "learning_rate": 5.346576500422655e-06, + "loss": 0.4768, + "step": 10565, + "task_loss": 0.7039610147476196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1790844053030014, + "epoch": 8.93, + "learning_rate": 5.342349957734574e-06, + "loss": 0.4636, + "step": 10566, + "task_loss": 0.5062955021858215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2807973027229309, + "epoch": 8.93, + "learning_rate": 5.3381234150464925e-06, + "loss": 0.3999, + "step": 10567, + "task_loss": 1.2764369249343872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5061820149421692, + "epoch": 8.93, + "learning_rate": 5.333896872358411e-06, + "loss": 0.398, + "step": 10568, + "task_loss": 0.31959405541419983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5525521636009216, + "epoch": 8.93, + "learning_rate": 5.329670329670329e-06, + "loss": 0.3362, + "step": 10569, + "task_loss": 0.7512363791465759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3614034056663513, + "epoch": 8.93, + "learning_rate": 5.325443786982249e-06, + "loss": 0.3506, + "step": 10570, + "task_loss": 0.43919432163238525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25045835971832275, + "epoch": 8.94, + "learning_rate": 5.321217244294167e-06, + "loss": 0.2651, + "step": 10571, + "task_loss": 0.604154109954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26306185126304626, + "epoch": 8.94, + "learning_rate": 5.316990701606087e-06, + "loss": 0.3136, + "step": 10572, + "task_loss": 0.4441031813621521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2923782467842102, + "epoch": 8.94, + "learning_rate": 5.312764158918005e-06, + "loss": 0.2962, + "step": 10573, + "task_loss": 0.600132167339325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21817299723625183, + "epoch": 8.94, + "learning_rate": 5.308537616229924e-06, + "loss": 0.3013, + "step": 10574, + "task_loss": 0.23203979432582855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49228203296661377, + "epoch": 8.94, + "learning_rate": 5.304311073541843e-06, + "loss": 0.3477, + "step": 10575, + "task_loss": 1.3442047834396362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19832350313663483, + "epoch": 8.94, + "learning_rate": 5.300084530853762e-06, + "loss": 0.3757, + "step": 10576, + "task_loss": 0.2660848796367645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.508938729763031, + "epoch": 8.94, + "learning_rate": 5.2958579881656805e-06, + "loss": 0.4269, + "step": 10577, + "task_loss": 0.3726608455181122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22324714064598083, + "epoch": 8.94, + "learning_rate": 5.2916314454775994e-06, + "loss": 0.3319, + "step": 10578, + "task_loss": 0.6251605749130249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22403176128864288, + "epoch": 8.94, + "learning_rate": 5.287404902789518e-06, + "loss": 0.2917, + "step": 10579, + "task_loss": 0.17355366051197052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3432474434375763, + "epoch": 8.94, + "learning_rate": 5.283178360101437e-06, + "loss": 0.3461, + "step": 10580, + "task_loss": 0.3043481111526489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2583116590976715, + "epoch": 8.94, + "learning_rate": 5.278951817413356e-06, + "loss": 0.411, + "step": 10581, + "task_loss": 0.6677039265632629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42057859897613525, + "epoch": 8.94, + "learning_rate": 5.274725274725275e-06, + "loss": 0.3854, + "step": 10582, + "task_loss": 0.4723869860172272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4363538324832916, + "epoch": 8.95, + "learning_rate": 5.270498732037194e-06, + "loss": 0.4118, + "step": 10583, + "task_loss": 0.17191146314144135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26022660732269287, + "epoch": 8.95, + "learning_rate": 5.266272189349113e-06, + "loss": 0.3207, + "step": 10584, + "task_loss": 0.6693881750106812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.252693235874176, + "epoch": 8.95, + "learning_rate": 5.262045646661032e-06, + "loss": 0.3499, + "step": 10585, + "task_loss": 0.17671498656272888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29161888360977173, + "epoch": 8.95, + "learning_rate": 5.2578191039729506e-06, + "loss": 0.4442, + "step": 10586, + "task_loss": 1.3715044260025024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3111943304538727, + "epoch": 8.95, + "learning_rate": 5.2535925612848695e-06, + "loss": 0.455, + "step": 10587, + "task_loss": 0.33042800426483154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3734613358974457, + "epoch": 8.95, + "learning_rate": 5.249366018596788e-06, + "loss": 0.3033, + "step": 10588, + "task_loss": 0.5430118441581726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.456808865070343, + "epoch": 8.95, + "learning_rate": 5.245139475908707e-06, + "loss": 0.342, + "step": 10589, + "task_loss": 0.2375822365283966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38724806904792786, + "epoch": 8.95, + "learning_rate": 5.240912933220625e-06, + "loss": 0.3927, + "step": 10590, + "task_loss": 0.35092824697494507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24220506846904755, + "epoch": 8.95, + "learning_rate": 5.236686390532545e-06, + "loss": 0.3321, + "step": 10591, + "task_loss": 0.19416414201259613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46470707654953003, + "epoch": 8.95, + "learning_rate": 5.232459847844463e-06, + "loss": 0.3165, + "step": 10592, + "task_loss": 0.7578445672988892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2870655059814453, + "epoch": 8.95, + "learning_rate": 5.228233305156383e-06, + "loss": 0.292, + "step": 10593, + "task_loss": 0.36816442012786865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5843943953514099, + "epoch": 8.95, + "learning_rate": 5.224006762468301e-06, + "loss": 0.4256, + "step": 10594, + "task_loss": 1.374983549118042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37522709369659424, + "epoch": 8.96, + "learning_rate": 5.219780219780221e-06, + "loss": 0.345, + "step": 10595, + "task_loss": 1.3052809238433838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.361098051071167, + "epoch": 8.96, + "learning_rate": 5.215553677092139e-06, + "loss": 0.341, + "step": 10596, + "task_loss": 0.6361895799636841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3357596695423126, + "epoch": 8.96, + "learning_rate": 5.2113271344040575e-06, + "loss": 0.3858, + "step": 10597, + "task_loss": 0.7915270328521729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4455062747001648, + "epoch": 8.96, + "learning_rate": 5.207100591715976e-06, + "loss": 0.3695, + "step": 10598, + "task_loss": 0.9808911085128784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27943286299705505, + "epoch": 8.96, + "learning_rate": 5.202874049027895e-06, + "loss": 0.3396, + "step": 10599, + "task_loss": 0.47498053312301636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4404623806476593, + "epoch": 8.96, + "learning_rate": 5.198647506339814e-06, + "loss": 0.3583, + "step": 10600, + "task_loss": 0.4487472176551819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17859220504760742, + "epoch": 8.96, + "learning_rate": 5.194420963651733e-06, + "loss": 0.291, + "step": 10601, + "task_loss": 0.6823776960372925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5924414396286011, + "epoch": 8.96, + "learning_rate": 5.190194420963652e-06, + "loss": 0.3913, + "step": 10602, + "task_loss": 0.9030060172080994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2751384675502777, + "epoch": 8.96, + "learning_rate": 5.185967878275571e-06, + "loss": 0.3242, + "step": 10603, + "task_loss": 0.4541858732700348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.534738302230835, + "epoch": 8.96, + "learning_rate": 5.18174133558749e-06, + "loss": 0.4127, + "step": 10604, + "task_loss": 0.8677483797073364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26253435015678406, + "epoch": 8.96, + "learning_rate": 5.177514792899409e-06, + "loss": 0.3494, + "step": 10605, + "task_loss": 0.6880300641059875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4415886402130127, + "epoch": 8.96, + "learning_rate": 5.173288250211327e-06, + "loss": 0.435, + "step": 10606, + "task_loss": 0.9131572246551514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3032299876213074, + "epoch": 8.97, + "learning_rate": 5.1690617075232464e-06, + "loss": 0.3768, + "step": 10607, + "task_loss": 0.2664109766483307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3132404088973999, + "epoch": 8.97, + "learning_rate": 5.1648351648351645e-06, + "loss": 0.368, + "step": 10608, + "task_loss": 0.4143713414669037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1638088971376419, + "epoch": 8.97, + "learning_rate": 5.160608622147084e-06, + "loss": 0.2673, + "step": 10609, + "task_loss": 0.05413144826889038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2843250632286072, + "epoch": 8.97, + "learning_rate": 5.156382079459002e-06, + "loss": 0.3393, + "step": 10610, + "task_loss": 0.11795742064714432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2727190852165222, + "epoch": 8.97, + "learning_rate": 5.152155536770922e-06, + "loss": 0.3207, + "step": 10611, + "task_loss": 0.2639559805393219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32542917132377625, + "epoch": 8.97, + "learning_rate": 5.147928994082841e-06, + "loss": 0.3273, + "step": 10612, + "task_loss": 0.6337400078773499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4037979543209076, + "epoch": 8.97, + "learning_rate": 5.143702451394759e-06, + "loss": 0.4104, + "step": 10613, + "task_loss": 0.6392010450363159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.323793888092041, + "epoch": 8.97, + "learning_rate": 5.139475908706679e-06, + "loss": 0.3123, + "step": 10614, + "task_loss": 0.6191710829734802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2690281569957733, + "epoch": 8.97, + "learning_rate": 5.135249366018597e-06, + "loss": 0.3434, + "step": 10615, + "task_loss": 0.6663408279418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5430330038070679, + "epoch": 8.97, + "learning_rate": 5.1310228233305165e-06, + "loss": 0.4859, + "step": 10616, + "task_loss": 0.7418842315673828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46147239208221436, + "epoch": 8.97, + "learning_rate": 5.1267962806424345e-06, + "loss": 0.4502, + "step": 10617, + "task_loss": 0.5960937738418579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4016711413860321, + "epoch": 8.97, + "learning_rate": 5.122569737954354e-06, + "loss": 0.3542, + "step": 10618, + "task_loss": 0.8922397494316101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29150354862213135, + "epoch": 8.98, + "learning_rate": 5.118343195266272e-06, + "loss": 0.3051, + "step": 10619, + "task_loss": 0.4391981065273285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5599380731582642, + "epoch": 8.98, + "learning_rate": 5.114116652578191e-06, + "loss": 0.3653, + "step": 10620, + "task_loss": 0.47943735122680664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4838765263557434, + "epoch": 8.98, + "learning_rate": 5.10989010989011e-06, + "loss": 0.3155, + "step": 10621, + "task_loss": 0.4959776997566223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2942233085632324, + "epoch": 8.98, + "learning_rate": 5.105663567202029e-06, + "loss": 0.3763, + "step": 10622, + "task_loss": 0.8853696584701538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4346213638782501, + "epoch": 8.98, + "learning_rate": 5.101437024513948e-06, + "loss": 0.5514, + "step": 10623, + "task_loss": 0.8086497783660889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4018877446651459, + "epoch": 8.98, + "learning_rate": 5.097210481825867e-06, + "loss": 0.3526, + "step": 10624, + "task_loss": 0.9724776148796082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3242490291595459, + "epoch": 8.98, + "learning_rate": 5.092983939137786e-06, + "loss": 0.3416, + "step": 10625, + "task_loss": 0.22248663008213043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4244734048843384, + "epoch": 8.98, + "learning_rate": 5.0887573964497045e-06, + "loss": 0.3521, + "step": 10626, + "task_loss": 0.9324353337287903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22923877835273743, + "epoch": 8.98, + "learning_rate": 5.0845308537616234e-06, + "loss": 0.2864, + "step": 10627, + "task_loss": 0.24742712080478668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26313456892967224, + "epoch": 8.98, + "learning_rate": 5.080304311073542e-06, + "loss": 0.2596, + "step": 10628, + "task_loss": 0.25181740522384644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23418673872947693, + "epoch": 8.98, + "learning_rate": 5.07607776838546e-06, + "loss": 0.3234, + "step": 10629, + "task_loss": 0.4639100432395935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21122485399246216, + "epoch": 8.99, + "learning_rate": 5.07185122569738e-06, + "loss": 0.3472, + "step": 10630, + "task_loss": 0.2071051001548767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1738552451133728, + "epoch": 8.99, + "learning_rate": 5.067624683009298e-06, + "loss": 0.3202, + "step": 10631, + "task_loss": 0.3892912268638611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27671360969543457, + "epoch": 8.99, + "learning_rate": 5.063398140321218e-06, + "loss": 0.3907, + "step": 10632, + "task_loss": 0.31942641735076904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3668547570705414, + "epoch": 8.99, + "learning_rate": 5.059171597633136e-06, + "loss": 0.347, + "step": 10633, + "task_loss": 0.45272907614707947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23243530094623566, + "epoch": 8.99, + "learning_rate": 5.054945054945056e-06, + "loss": 0.3694, + "step": 10634, + "task_loss": 0.1549949049949646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1866087019443512, + "epoch": 8.99, + "learning_rate": 5.050718512256974e-06, + "loss": 0.3542, + "step": 10635, + "task_loss": 0.24225318431854248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3390258550643921, + "epoch": 8.99, + "learning_rate": 5.046491969568893e-06, + "loss": 0.397, + "step": 10636, + "task_loss": 0.0911502093076706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1836473047733307, + "epoch": 8.99, + "learning_rate": 5.0422654268808115e-06, + "loss": 0.3597, + "step": 10637, + "task_loss": 0.5673336386680603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5017399787902832, + "epoch": 8.99, + "learning_rate": 5.03803888419273e-06, + "loss": 0.4054, + "step": 10638, + "task_loss": 0.3923577666282654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40955376625061035, + "epoch": 8.99, + "learning_rate": 5.033812341504649e-06, + "loss": 0.373, + "step": 10639, + "task_loss": 0.7415474653244019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23151738941669464, + "epoch": 8.99, + "learning_rate": 5.029585798816568e-06, + "loss": 0.2752, + "step": 10640, + "task_loss": 0.3509952425956726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34489303827285767, + "epoch": 8.99, + "learning_rate": 5.025359256128487e-06, + "loss": 0.3318, + "step": 10641, + "task_loss": 0.47115615010261536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32184505462646484, + "epoch": 9.0, + "learning_rate": 5.021132713440406e-06, + "loss": 0.4878, + "step": 10642, + "task_loss": 0.10446401685476303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3233630061149597, + "epoch": 9.0, + "learning_rate": 5.016906170752325e-06, + "loss": 0.4879, + "step": 10643, + "task_loss": 0.3864535093307495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3556531071662903, + "epoch": 9.0, + "learning_rate": 5.012679628064244e-06, + "loss": 0.3533, + "step": 10644, + "task_loss": 0.591995894908905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4322413206100464, + "epoch": 9.0, + "learning_rate": 5.008453085376163e-06, + "loss": 0.5175, + "step": 10645, + "task_loss": 0.6103222370147705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43343114852905273, + "epoch": 9.0, + "learning_rate": 5.0042265426880815e-06, + "loss": 0.3903, + "step": 10646, + "task_loss": 0.8915157318115234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4003659784793854, + "epoch": 9.0, + "learning_rate": 5e-06, + "loss": 0.4182, + "step": 10647, + "task_loss": 0.21519266068935394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2912815511226654, + "epoch": 9.0, + "learning_rate": 4.995773457311919e-06, + "loss": 0.6289, + "step": 10648, + "task_loss": 0.20611804723739624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5121369361877441, + "epoch": 9.0, + "learning_rate": 4.991546914623838e-06, + "loss": 0.3234, + "step": 10649, + "task_loss": 1.0983569622039795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5387423634529114, + "epoch": 9.0, + "learning_rate": 4.987320371935757e-06, + "loss": 0.4862, + "step": 10650, + "task_loss": 0.6497392058372498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.463063508272171, + "epoch": 9.0, + "learning_rate": 4.983093829247676e-06, + "loss": 0.4042, + "step": 10651, + "task_loss": 0.5647193193435669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32789114117622375, + "epoch": 9.0, + "learning_rate": 4.978867286559594e-06, + "loss": 0.3413, + "step": 10652, + "task_loss": 0.18238121271133423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4732763469219208, + "epoch": 9.01, + "learning_rate": 4.974640743871514e-06, + "loss": 0.4078, + "step": 10653, + "task_loss": 0.5495404005050659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14766304194927216, + "epoch": 9.01, + "learning_rate": 4.970414201183432e-06, + "loss": 0.3616, + "step": 10654, + "task_loss": 0.09680415689945221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36063235998153687, + "epoch": 9.01, + "learning_rate": 4.9661876584953515e-06, + "loss": 0.2867, + "step": 10655, + "task_loss": 0.03089725971221924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4248960018157959, + "epoch": 9.01, + "learning_rate": 4.96196111580727e-06, + "loss": 0.3523, + "step": 10656, + "task_loss": 0.6082748770713806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3456171452999115, + "epoch": 9.01, + "learning_rate": 4.957734573119189e-06, + "loss": 0.3932, + "step": 10657, + "task_loss": 0.5271035432815552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38025981187820435, + "epoch": 9.01, + "learning_rate": 4.953508030431107e-06, + "loss": 0.3571, + "step": 10658, + "task_loss": 0.7842199802398682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4166877865791321, + "epoch": 9.01, + "learning_rate": 4.949281487743026e-06, + "loss": 0.418, + "step": 10659, + "task_loss": 0.5476058721542358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4175650477409363, + "epoch": 9.01, + "learning_rate": 4.945054945054945e-06, + "loss": 0.3421, + "step": 10660, + "task_loss": 0.261981338262558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2540813386440277, + "epoch": 9.01, + "learning_rate": 4.940828402366864e-06, + "loss": 0.3023, + "step": 10661, + "task_loss": 0.3522268235683441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29951393604278564, + "epoch": 9.01, + "learning_rate": 4.936601859678783e-06, + "loss": 0.3398, + "step": 10662, + "task_loss": 0.7414291501045227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.280769944190979, + "epoch": 9.01, + "learning_rate": 4.932375316990702e-06, + "loss": 0.3469, + "step": 10663, + "task_loss": 0.8019564151763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3119147717952728, + "epoch": 9.01, + "learning_rate": 4.928148774302621e-06, + "loss": 0.4426, + "step": 10664, + "task_loss": 0.06697451323270798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25725752115249634, + "epoch": 9.02, + "learning_rate": 4.92392223161454e-06, + "loss": 0.3687, + "step": 10665, + "task_loss": 0.7880474328994751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22658953070640564, + "epoch": 9.02, + "learning_rate": 4.9196956889264585e-06, + "loss": 0.2822, + "step": 10666, + "task_loss": 0.8439580798149109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.581402599811554, + "epoch": 9.02, + "learning_rate": 4.915469146238377e-06, + "loss": 0.5299, + "step": 10667, + "task_loss": 0.35453835129737854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34796902537345886, + "epoch": 9.02, + "learning_rate": 4.9112426035502954e-06, + "loss": 0.335, + "step": 10668, + "task_loss": 0.6323029398918152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31796878576278687, + "epoch": 9.02, + "learning_rate": 4.907016060862215e-06, + "loss": 0.457, + "step": 10669, + "task_loss": 0.7333263158798218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5624836087226868, + "epoch": 9.02, + "learning_rate": 4.902789518174133e-06, + "loss": 0.3591, + "step": 10670, + "task_loss": 0.7293015718460083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34871602058410645, + "epoch": 9.02, + "learning_rate": 4.898562975486053e-06, + "loss": 0.3605, + "step": 10671, + "task_loss": 0.4668397307395935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36086493730545044, + "epoch": 9.02, + "learning_rate": 4.894336432797972e-06, + "loss": 0.3219, + "step": 10672, + "task_loss": 0.6964598894119263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3666614592075348, + "epoch": 9.02, + "learning_rate": 4.89010989010989e-06, + "loss": 0.3484, + "step": 10673, + "task_loss": 0.6715587973594666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27762657403945923, + "epoch": 9.02, + "learning_rate": 4.88588334742181e-06, + "loss": 0.345, + "step": 10674, + "task_loss": 0.7563305497169495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27642008662223816, + "epoch": 9.02, + "learning_rate": 4.881656804733728e-06, + "loss": 0.2783, + "step": 10675, + "task_loss": 0.6613790988922119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5069283843040466, + "epoch": 9.02, + "learning_rate": 4.877430262045647e-06, + "loss": 0.3958, + "step": 10676, + "task_loss": 0.3895402252674103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28122851252555847, + "epoch": 9.03, + "learning_rate": 4.8732037193575655e-06, + "loss": 0.2782, + "step": 10677, + "task_loss": 0.2643376290798187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49160489439964294, + "epoch": 9.03, + "learning_rate": 4.868977176669485e-06, + "loss": 0.3899, + "step": 10678, + "task_loss": 1.1128556728363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41654402017593384, + "epoch": 9.03, + "learning_rate": 4.864750633981403e-06, + "loss": 0.3316, + "step": 10679, + "task_loss": 0.4175534248352051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3051929473876953, + "epoch": 9.03, + "learning_rate": 4.860524091293322e-06, + "loss": 0.3463, + "step": 10680, + "task_loss": 0.9455059766769409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43045341968536377, + "epoch": 9.03, + "learning_rate": 4.856297548605241e-06, + "loss": 0.3645, + "step": 10681, + "task_loss": 0.6507662534713745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47211459279060364, + "epoch": 9.03, + "learning_rate": 4.85207100591716e-06, + "loss": 0.4446, + "step": 10682, + "task_loss": 0.988862931728363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20833662152290344, + "epoch": 9.03, + "learning_rate": 4.847844463229079e-06, + "loss": 0.3528, + "step": 10683, + "task_loss": 1.0985119342803955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35033586621284485, + "epoch": 9.03, + "learning_rate": 4.843617920540998e-06, + "loss": 0.3259, + "step": 10684, + "task_loss": 0.6822803020477295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28949639201164246, + "epoch": 9.03, + "learning_rate": 4.839391377852917e-06, + "loss": 0.3673, + "step": 10685, + "task_loss": 0.2227640151977539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31784504652023315, + "epoch": 9.03, + "learning_rate": 4.8351648351648355e-06, + "loss": 0.3665, + "step": 10686, + "task_loss": 1.166419506072998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4475977122783661, + "epoch": 9.03, + "learning_rate": 4.830938292476754e-06, + "loss": 0.4198, + "step": 10687, + "task_loss": 0.18233591318130493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3862488269805908, + "epoch": 9.03, + "learning_rate": 4.826711749788673e-06, + "loss": 0.4287, + "step": 10688, + "task_loss": 0.83706134557724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.261260986328125, + "epoch": 9.04, + "learning_rate": 4.822485207100591e-06, + "loss": 0.35, + "step": 10689, + "task_loss": 0.20080877840518951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35010775923728943, + "epoch": 9.04, + "learning_rate": 4.818258664412511e-06, + "loss": 0.3058, + "step": 10690, + "task_loss": 0.38369935750961304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45298463106155396, + "epoch": 9.04, + "learning_rate": 4.814032121724429e-06, + "loss": 0.4095, + "step": 10691, + "task_loss": 0.8317689299583435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46889349818229675, + "epoch": 9.04, + "learning_rate": 4.809805579036349e-06, + "loss": 0.3966, + "step": 10692, + "task_loss": 0.9253939390182495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2578296363353729, + "epoch": 9.04, + "learning_rate": 4.805579036348267e-06, + "loss": 0.4157, + "step": 10693, + "task_loss": 0.5661696195602417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20081967115402222, + "epoch": 9.04, + "learning_rate": 4.801352493660187e-06, + "loss": 0.397, + "step": 10694, + "task_loss": 0.5611146688461304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.491626501083374, + "epoch": 9.04, + "learning_rate": 4.797125950972105e-06, + "loss": 0.4398, + "step": 10695, + "task_loss": 0.5792623162269592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19662265479564667, + "epoch": 9.04, + "learning_rate": 4.7928994082840236e-06, + "loss": 0.2359, + "step": 10696, + "task_loss": 0.2238200604915619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14374740421772003, + "epoch": 9.04, + "learning_rate": 4.7886728655959424e-06, + "loss": 0.2979, + "step": 10697, + "task_loss": 0.1255718171596527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30877968668937683, + "epoch": 9.04, + "learning_rate": 4.784446322907861e-06, + "loss": 0.4171, + "step": 10698, + "task_loss": 0.3169753849506378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3452456593513489, + "epoch": 9.04, + "learning_rate": 4.78021978021978e-06, + "loss": 0.3369, + "step": 10699, + "task_loss": 1.0453174114227295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2903060019016266, + "epoch": 9.04, + "learning_rate": 4.775993237531699e-06, + "loss": 0.3928, + "step": 10700, + "task_loss": 0.5046507120132446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23889023065567017, + "epoch": 9.05, + "learning_rate": 4.771766694843619e-06, + "loss": 0.3848, + "step": 10701, + "task_loss": 0.055103663355112076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5400428771972656, + "epoch": 9.05, + "learning_rate": 4.767540152155537e-06, + "loss": 0.3857, + "step": 10702, + "task_loss": 0.7502389550209045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2559239864349365, + "epoch": 9.05, + "learning_rate": 4.763313609467456e-06, + "loss": 0.3334, + "step": 10703, + "task_loss": 0.38283246755599976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3501482307910919, + "epoch": 9.05, + "learning_rate": 4.759087066779375e-06, + "loss": 0.4123, + "step": 10704, + "task_loss": 1.1107497215270996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3360772430896759, + "epoch": 9.05, + "learning_rate": 4.754860524091294e-06, + "loss": 0.3065, + "step": 10705, + "task_loss": 0.1607997566461563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15610381960868835, + "epoch": 9.05, + "learning_rate": 4.7506339814032125e-06, + "loss": 0.3427, + "step": 10706, + "task_loss": 0.2609812021255493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2942715883255005, + "epoch": 9.05, + "learning_rate": 4.746407438715131e-06, + "loss": 0.4206, + "step": 10707, + "task_loss": 0.6877492070198059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.229124516248703, + "epoch": 9.05, + "learning_rate": 4.74218089602705e-06, + "loss": 0.4248, + "step": 10708, + "task_loss": 0.931330680847168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2846606969833374, + "epoch": 9.05, + "learning_rate": 4.737954353338969e-06, + "loss": 0.3611, + "step": 10709, + "task_loss": 0.6238901615142822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5292601585388184, + "epoch": 9.05, + "learning_rate": 4.733727810650888e-06, + "loss": 0.477, + "step": 10710, + "task_loss": 0.5242747664451599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25060832500457764, + "epoch": 9.05, + "learning_rate": 4.729501267962807e-06, + "loss": 0.3025, + "step": 10711, + "task_loss": 0.5326293110847473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3296988606452942, + "epoch": 9.05, + "learning_rate": 4.725274725274725e-06, + "loss": 0.3033, + "step": 10712, + "task_loss": 0.3291831314563751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2981824278831482, + "epoch": 9.06, + "learning_rate": 4.721048182586645e-06, + "loss": 0.4077, + "step": 10713, + "task_loss": 0.39480432868003845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4494365155696869, + "epoch": 9.06, + "learning_rate": 4.716821639898563e-06, + "loss": 0.3906, + "step": 10714, + "task_loss": 1.0682185888290405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5336811542510986, + "epoch": 9.06, + "learning_rate": 4.7125950972104825e-06, + "loss": 0.343, + "step": 10715, + "task_loss": 0.46252161264419556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2937215268611908, + "epoch": 9.06, + "learning_rate": 4.7083685545224005e-06, + "loss": 0.3373, + "step": 10716, + "task_loss": 0.7945277690887451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2907503545284271, + "epoch": 9.06, + "learning_rate": 4.70414201183432e-06, + "loss": 0.3886, + "step": 10717, + "task_loss": 0.5997141599655151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26890385150909424, + "epoch": 9.06, + "learning_rate": 4.699915469146238e-06, + "loss": 0.3577, + "step": 10718, + "task_loss": 0.7894372940063477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39001747965812683, + "epoch": 9.06, + "learning_rate": 4.695688926458157e-06, + "loss": 0.3316, + "step": 10719, + "task_loss": 0.1928696632385254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32406699657440186, + "epoch": 9.06, + "learning_rate": 4.691462383770076e-06, + "loss": 0.2641, + "step": 10720, + "task_loss": 0.31352487206459045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2600927948951721, + "epoch": 9.06, + "learning_rate": 4.687235841081995e-06, + "loss": 0.3162, + "step": 10721, + "task_loss": 0.42545148730278015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3026631474494934, + "epoch": 9.06, + "learning_rate": 4.683009298393914e-06, + "loss": 0.3172, + "step": 10722, + "task_loss": 0.15800407528877258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2899171710014343, + "epoch": 9.06, + "learning_rate": 4.678782755705833e-06, + "loss": 0.367, + "step": 10723, + "task_loss": 0.1556074321269989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2743762731552124, + "epoch": 9.07, + "learning_rate": 4.674556213017752e-06, + "loss": 0.4027, + "step": 10724, + "task_loss": 0.45050162076950073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4501574635505676, + "epoch": 9.07, + "learning_rate": 4.6703296703296706e-06, + "loss": 0.4629, + "step": 10725, + "task_loss": 0.8416013717651367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3481593728065491, + "epoch": 9.07, + "learning_rate": 4.6661031276415895e-06, + "loss": 0.3863, + "step": 10726, + "task_loss": 0.059805918484926224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5137248039245605, + "epoch": 9.07, + "learning_rate": 4.661876584953508e-06, + "loss": 0.4973, + "step": 10727, + "task_loss": 1.0285333395004272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31665855646133423, + "epoch": 9.07, + "learning_rate": 4.657650042265426e-06, + "loss": 0.3929, + "step": 10728, + "task_loss": 1.3786296844482422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2799319624900818, + "epoch": 9.07, + "learning_rate": 4.653423499577346e-06, + "loss": 0.4027, + "step": 10729, + "task_loss": 1.0779032707214355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3463509976863861, + "epoch": 9.07, + "learning_rate": 4.649196956889264e-06, + "loss": 0.3102, + "step": 10730, + "task_loss": 1.2674719095230103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3920883536338806, + "epoch": 9.07, + "learning_rate": 4.644970414201184e-06, + "loss": 0.3149, + "step": 10731, + "task_loss": 0.42668280005455017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26851990818977356, + "epoch": 9.07, + "learning_rate": 4.640743871513103e-06, + "loss": 0.4617, + "step": 10732, + "task_loss": 0.7153639793395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5444235801696777, + "epoch": 9.07, + "learning_rate": 4.636517328825022e-06, + "loss": 0.393, + "step": 10733, + "task_loss": 1.545829176902771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38023605942726135, + "epoch": 9.07, + "learning_rate": 4.632290786136941e-06, + "loss": 0.3651, + "step": 10734, + "task_loss": 0.7259161472320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23945069313049316, + "epoch": 9.07, + "learning_rate": 4.628064243448859e-06, + "loss": 0.2796, + "step": 10735, + "task_loss": 0.5746563076972961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5219240784645081, + "epoch": 9.08, + "learning_rate": 4.623837700760778e-06, + "loss": 0.3858, + "step": 10736, + "task_loss": 0.5761024951934814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1435006558895111, + "epoch": 9.08, + "learning_rate": 4.619611158072696e-06, + "loss": 0.3705, + "step": 10737, + "task_loss": 0.3132264018058777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34755784273147583, + "epoch": 9.08, + "learning_rate": 4.615384615384616e-06, + "loss": 0.3582, + "step": 10738, + "task_loss": 0.6461994051933289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3404172360897064, + "epoch": 9.08, + "learning_rate": 4.611158072696534e-06, + "loss": 0.4017, + "step": 10739, + "task_loss": 0.5304747819900513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1973743438720703, + "epoch": 9.08, + "learning_rate": 4.606931530008454e-06, + "loss": 0.2858, + "step": 10740, + "task_loss": 0.7629637122154236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3983118236064911, + "epoch": 9.08, + "learning_rate": 4.602704987320372e-06, + "loss": 0.5235, + "step": 10741, + "task_loss": 0.8805110454559326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3310662508010864, + "epoch": 9.08, + "learning_rate": 4.598478444632291e-06, + "loss": 0.369, + "step": 10742, + "task_loss": 0.5598223805427551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46633660793304443, + "epoch": 9.08, + "learning_rate": 4.59425190194421e-06, + "loss": 0.3477, + "step": 10743, + "task_loss": 0.6799585819244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42530912160873413, + "epoch": 9.08, + "learning_rate": 4.590025359256129e-06, + "loss": 0.3234, + "step": 10744, + "task_loss": 0.3565773367881775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31919875741004944, + "epoch": 9.08, + "learning_rate": 4.5857988165680475e-06, + "loss": 0.3198, + "step": 10745, + "task_loss": 0.47649940848350525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22381871938705444, + "epoch": 9.08, + "learning_rate": 4.5815722738799664e-06, + "loss": 0.3088, + "step": 10746, + "task_loss": 0.16297003626823425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4028918147087097, + "epoch": 9.08, + "learning_rate": 4.577345731191885e-06, + "loss": 0.3986, + "step": 10747, + "task_loss": 1.0495113134384155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21191802620887756, + "epoch": 9.09, + "learning_rate": 4.573119188503804e-06, + "loss": 0.3383, + "step": 10748, + "task_loss": 0.21829430758953094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4434305429458618, + "epoch": 9.09, + "learning_rate": 4.568892645815723e-06, + "loss": 0.4155, + "step": 10749, + "task_loss": 1.6755656003952026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3997635543346405, + "epoch": 9.09, + "learning_rate": 4.564666103127642e-06, + "loss": 0.326, + "step": 10750, + "task_loss": 0.22782394289970398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2671975791454315, + "epoch": 9.09, + "learning_rate": 4.56043956043956e-06, + "loss": 0.4, + "step": 10751, + "task_loss": 0.3082874119281769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20449531078338623, + "epoch": 9.09, + "learning_rate": 4.55621301775148e-06, + "loss": 0.3801, + "step": 10752, + "task_loss": 0.5542439222335815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23802286386489868, + "epoch": 9.09, + "learning_rate": 4.551986475063398e-06, + "loss": 0.354, + "step": 10753, + "task_loss": 0.36673253774642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2485799491405487, + "epoch": 9.09, + "learning_rate": 4.5477599323753176e-06, + "loss": 0.3071, + "step": 10754, + "task_loss": 0.6225672364234924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46099853515625, + "epoch": 9.09, + "learning_rate": 4.543533389687236e-06, + "loss": 0.4224, + "step": 10755, + "task_loss": 0.7647070288658142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2656293213367462, + "epoch": 9.09, + "learning_rate": 4.539306846999155e-06, + "loss": 0.3815, + "step": 10756, + "task_loss": 0.534451425075531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3697848320007324, + "epoch": 9.09, + "learning_rate": 4.535080304311073e-06, + "loss": 0.385, + "step": 10757, + "task_loss": 0.804455041885376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3053297996520996, + "epoch": 9.09, + "learning_rate": 4.530853761622992e-06, + "loss": 0.3177, + "step": 10758, + "task_loss": 1.5795748233795166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.222305029630661, + "epoch": 9.09, + "learning_rate": 4.526627218934911e-06, + "loss": 0.3634, + "step": 10759, + "task_loss": 0.5249326229095459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4668792486190796, + "epoch": 9.1, + "learning_rate": 4.52240067624683e-06, + "loss": 0.3531, + "step": 10760, + "task_loss": 0.5021543502807617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2608847916126251, + "epoch": 9.1, + "learning_rate": 4.51817413355875e-06, + "loss": 0.3592, + "step": 10761, + "task_loss": 0.014046192169189453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22430363297462463, + "epoch": 9.1, + "learning_rate": 4.513947590870668e-06, + "loss": 0.2756, + "step": 10762, + "task_loss": 0.2467626929283142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2924463748931885, + "epoch": 9.1, + "learning_rate": 4.509721048182587e-06, + "loss": 0.3585, + "step": 10763, + "task_loss": 0.2170679122209549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5852993726730347, + "epoch": 9.1, + "learning_rate": 4.505494505494506e-06, + "loss": 0.378, + "step": 10764, + "task_loss": 0.2560487687587738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27196985483169556, + "epoch": 9.1, + "learning_rate": 4.5012679628064245e-06, + "loss": 0.383, + "step": 10765, + "task_loss": 0.8842899799346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37693604826927185, + "epoch": 9.1, + "learning_rate": 4.497041420118343e-06, + "loss": 0.415, + "step": 10766, + "task_loss": 0.18818359076976776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39774179458618164, + "epoch": 9.1, + "learning_rate": 4.492814877430262e-06, + "loss": 0.4105, + "step": 10767, + "task_loss": 1.410965085029602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27049389481544495, + "epoch": 9.1, + "learning_rate": 4.488588334742181e-06, + "loss": 0.4349, + "step": 10768, + "task_loss": 0.6354483962059021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.347796767950058, + "epoch": 9.1, + "learning_rate": 4.4843617920541e-06, + "loss": 0.3914, + "step": 10769, + "task_loss": 0.4951612055301666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2434314489364624, + "epoch": 9.1, + "learning_rate": 4.480135249366019e-06, + "loss": 0.3397, + "step": 10770, + "task_loss": 0.061899181455373764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19929280877113342, + "epoch": 9.1, + "learning_rate": 4.475908706677938e-06, + "loss": 0.3087, + "step": 10771, + "task_loss": 0.3122748136520386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35466569662094116, + "epoch": 9.11, + "learning_rate": 4.471682163989857e-06, + "loss": 0.369, + "step": 10772, + "task_loss": 0.11722215265035629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4237186014652252, + "epoch": 9.11, + "learning_rate": 4.467455621301776e-06, + "loss": 0.4373, + "step": 10773, + "task_loss": 1.364401936531067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5618196725845337, + "epoch": 9.11, + "learning_rate": 4.463229078613694e-06, + "loss": 0.3597, + "step": 10774, + "task_loss": 0.7290211915969849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4054659605026245, + "epoch": 9.11, + "learning_rate": 4.4590025359256134e-06, + "loss": 0.4968, + "step": 10775, + "task_loss": 0.7176336050033569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4422805905342102, + "epoch": 9.11, + "learning_rate": 4.4547759932375315e-06, + "loss": 0.3071, + "step": 10776, + "task_loss": 0.10663993656635284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5104308128356934, + "epoch": 9.11, + "learning_rate": 4.450549450549451e-06, + "loss": 0.3749, + "step": 10777, + "task_loss": 1.015378713607788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5120430588722229, + "epoch": 9.11, + "learning_rate": 4.446322907861369e-06, + "loss": 0.5268, + "step": 10778, + "task_loss": 0.28718680143356323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5109079480171204, + "epoch": 9.11, + "learning_rate": 4.442096365173288e-06, + "loss": 0.423, + "step": 10779, + "task_loss": 0.2126019448041916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1872909963130951, + "epoch": 9.11, + "learning_rate": 4.437869822485207e-06, + "loss": 0.3409, + "step": 10780, + "task_loss": 0.8963184356689453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.401829332113266, + "epoch": 9.11, + "learning_rate": 4.433643279797126e-06, + "loss": 0.3096, + "step": 10781, + "task_loss": 0.4105207622051239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23689891397953033, + "epoch": 9.11, + "learning_rate": 4.429416737109045e-06, + "loss": 0.2445, + "step": 10782, + "task_loss": 0.8406711220741272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.289004921913147, + "epoch": 9.11, + "learning_rate": 4.425190194420964e-06, + "loss": 0.3941, + "step": 10783, + "task_loss": 0.5913785099983215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42233243584632874, + "epoch": 9.12, + "learning_rate": 4.420963651732883e-06, + "loss": 0.4009, + "step": 10784, + "task_loss": 0.19679613411426544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3874870538711548, + "epoch": 9.12, + "learning_rate": 4.4167371090448015e-06, + "loss": 0.3574, + "step": 10785, + "task_loss": 1.3830676078796387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3288381099700928, + "epoch": 9.12, + "learning_rate": 4.41251056635672e-06, + "loss": 0.4197, + "step": 10786, + "task_loss": 0.43626508116722107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5049364566802979, + "epoch": 9.12, + "learning_rate": 4.408284023668639e-06, + "loss": 0.3566, + "step": 10787, + "task_loss": 1.0110511779785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2575474977493286, + "epoch": 9.12, + "learning_rate": 4.404057480980558e-06, + "loss": 0.373, + "step": 10788, + "task_loss": 0.2532555162906647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3124488890171051, + "epoch": 9.12, + "learning_rate": 4.399830938292477e-06, + "loss": 0.4488, + "step": 10789, + "task_loss": 0.5693467259407043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25977104902267456, + "epoch": 9.12, + "learning_rate": 4.395604395604396e-06, + "loss": 0.317, + "step": 10790, + "task_loss": 0.5720422267913818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29429638385772705, + "epoch": 9.12, + "learning_rate": 4.391377852916315e-06, + "loss": 0.3396, + "step": 10791, + "task_loss": 0.5651503801345825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1949886828660965, + "epoch": 9.12, + "learning_rate": 4.387151310228234e-06, + "loss": 0.2594, + "step": 10792, + "task_loss": 0.9995611906051636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.453154981136322, + "epoch": 9.12, + "learning_rate": 4.382924767540153e-06, + "loss": 0.5599, + "step": 10793, + "task_loss": 0.697390615940094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.410897433757782, + "epoch": 9.12, + "learning_rate": 4.3786982248520715e-06, + "loss": 0.5007, + "step": 10794, + "task_loss": 0.3754793107509613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24273744225502014, + "epoch": 9.13, + "learning_rate": 4.37447168216399e-06, + "loss": 0.2748, + "step": 10795, + "task_loss": 0.3099212050437927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46858909726142883, + "epoch": 9.13, + "learning_rate": 4.370245139475909e-06, + "loss": 0.442, + "step": 10796, + "task_loss": 0.5783044099807739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23016443848609924, + "epoch": 9.13, + "learning_rate": 4.366018596787827e-06, + "loss": 0.3046, + "step": 10797, + "task_loss": 0.11538434773683548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27203109860420227, + "epoch": 9.13, + "learning_rate": 4.361792054099747e-06, + "loss": 0.3768, + "step": 10798, + "task_loss": 0.19954493641853333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3558979034423828, + "epoch": 9.13, + "learning_rate": 4.357565511411665e-06, + "loss": 0.3013, + "step": 10799, + "task_loss": 0.2688828408718109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4067249000072479, + "epoch": 9.13, + "learning_rate": 4.353338968723585e-06, + "loss": 0.4609, + "step": 10800, + "task_loss": 0.668369710445404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2674903869628906, + "epoch": 9.13, + "learning_rate": 4.349112426035503e-06, + "loss": 0.4023, + "step": 10801, + "task_loss": 0.5800718665122986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35724711418151855, + "epoch": 9.13, + "learning_rate": 4.344885883347422e-06, + "loss": 0.3171, + "step": 10802, + "task_loss": 0.3377666175365448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.349151611328125, + "epoch": 9.13, + "learning_rate": 4.340659340659341e-06, + "loss": 0.4415, + "step": 10803, + "task_loss": 1.025716781616211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3418132960796356, + "epoch": 9.13, + "learning_rate": 4.33643279797126e-06, + "loss": 0.3339, + "step": 10804, + "task_loss": 0.8026849031448364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34848421812057495, + "epoch": 9.13, + "learning_rate": 4.3322062552831785e-06, + "loss": 0.3832, + "step": 10805, + "task_loss": 0.3262981176376343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14944732189178467, + "epoch": 9.13, + "learning_rate": 4.327979712595097e-06, + "loss": 0.2701, + "step": 10806, + "task_loss": 0.06320273876190186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1887546330690384, + "epoch": 9.14, + "learning_rate": 4.323753169907016e-06, + "loss": 0.2458, + "step": 10807, + "task_loss": 0.1402512639760971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2622518837451935, + "epoch": 9.14, + "learning_rate": 4.319526627218935e-06, + "loss": 0.2813, + "step": 10808, + "task_loss": 0.18590986728668213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49820035696029663, + "epoch": 9.14, + "learning_rate": 4.315300084530854e-06, + "loss": 0.299, + "step": 10809, + "task_loss": 0.6286423206329346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23891744017601013, + "epoch": 9.14, + "learning_rate": 4.311073541842773e-06, + "loss": 0.3337, + "step": 10810, + "task_loss": 0.37884652614593506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3054962754249573, + "epoch": 9.14, + "learning_rate": 4.306846999154691e-06, + "loss": 0.3747, + "step": 10811, + "task_loss": 0.19021984934806824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19608978927135468, + "epoch": 9.14, + "learning_rate": 4.302620456466611e-06, + "loss": 0.3385, + "step": 10812, + "task_loss": 0.13306094706058502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5409775972366333, + "epoch": 9.14, + "learning_rate": 4.298393913778529e-06, + "loss": 0.3888, + "step": 10813, + "task_loss": 0.7704942226409912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48843520879745483, + "epoch": 9.14, + "learning_rate": 4.2941673710904485e-06, + "loss": 0.4052, + "step": 10814, + "task_loss": 0.6687980890274048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21563029289245605, + "epoch": 9.14, + "learning_rate": 4.2899408284023666e-06, + "loss": 0.3536, + "step": 10815, + "task_loss": 0.4911973476409912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2460443526506424, + "epoch": 9.14, + "learning_rate": 4.285714285714286e-06, + "loss": 0.3092, + "step": 10816, + "task_loss": 0.2874479591846466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45760709047317505, + "epoch": 9.14, + "learning_rate": 4.281487743026204e-06, + "loss": 0.4711, + "step": 10817, + "task_loss": 0.44065314531326294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4063870906829834, + "epoch": 9.14, + "learning_rate": 4.277261200338123e-06, + "loss": 0.4091, + "step": 10818, + "task_loss": 0.7656653523445129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16463279724121094, + "epoch": 9.15, + "learning_rate": 4.273034657650042e-06, + "loss": 0.2469, + "step": 10819, + "task_loss": 0.16565841436386108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3141283094882965, + "epoch": 9.15, + "learning_rate": 4.268808114961961e-06, + "loss": 0.3301, + "step": 10820, + "task_loss": 0.32154935598373413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2942084074020386, + "epoch": 9.15, + "learning_rate": 4.264581572273881e-06, + "loss": 0.4, + "step": 10821, + "task_loss": 0.3492378294467926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5911177396774292, + "epoch": 9.15, + "learning_rate": 4.260355029585799e-06, + "loss": 0.3982, + "step": 10822, + "task_loss": 0.5370573401451111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3472902178764343, + "epoch": 9.15, + "learning_rate": 4.2561284868977185e-06, + "loss": 0.3437, + "step": 10823, + "task_loss": 0.04951505735516548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3588460683822632, + "epoch": 9.15, + "learning_rate": 4.251901944209637e-06, + "loss": 0.3307, + "step": 10824, + "task_loss": 0.24767516553401947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39019525051116943, + "epoch": 9.15, + "learning_rate": 4.2476754015215555e-06, + "loss": 0.431, + "step": 10825, + "task_loss": 0.35043588280677795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2820150852203369, + "epoch": 9.15, + "learning_rate": 4.243448858833474e-06, + "loss": 0.313, + "step": 10826, + "task_loss": 0.5107693076133728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2387271523475647, + "epoch": 9.15, + "learning_rate": 4.239222316145393e-06, + "loss": 0.4, + "step": 10827, + "task_loss": 1.157117486000061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2085588276386261, + "epoch": 9.15, + "learning_rate": 4.234995773457312e-06, + "loss": 0.3592, + "step": 10828, + "task_loss": 0.48905205726623535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6823012828826904, + "epoch": 9.15, + "learning_rate": 4.230769230769231e-06, + "loss": 0.4801, + "step": 10829, + "task_loss": 0.46268802881240845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20702511072158813, + "epoch": 9.15, + "learning_rate": 4.22654268808115e-06, + "loss": 0.2952, + "step": 10830, + "task_loss": 0.8787353038787842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.329217791557312, + "epoch": 9.16, + "learning_rate": 4.222316145393069e-06, + "loss": 0.4165, + "step": 10831, + "task_loss": 0.5806611180305481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49163514375686646, + "epoch": 9.16, + "learning_rate": 4.218089602704988e-06, + "loss": 0.428, + "step": 10832, + "task_loss": 0.40630850195884705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30569207668304443, + "epoch": 9.16, + "learning_rate": 4.213863060016907e-06, + "loss": 0.3955, + "step": 10833, + "task_loss": 0.4833647608757019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32855385541915894, + "epoch": 9.16, + "learning_rate": 4.209636517328825e-06, + "loss": 0.3755, + "step": 10834, + "task_loss": 0.917308509349823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38809335231781006, + "epoch": 9.16, + "learning_rate": 4.205409974640744e-06, + "loss": 0.3976, + "step": 10835, + "task_loss": 0.5272912979125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2863474190235138, + "epoch": 9.16, + "learning_rate": 4.2011834319526624e-06, + "loss": 0.3453, + "step": 10836, + "task_loss": 0.7223553657531738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44409701228141785, + "epoch": 9.16, + "learning_rate": 4.196956889264582e-06, + "loss": 0.3974, + "step": 10837, + "task_loss": 0.6640463471412659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24882397055625916, + "epoch": 9.16, + "learning_rate": 4.1927303465765e-06, + "loss": 0.4732, + "step": 10838, + "task_loss": 0.4024899899959564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3268924355506897, + "epoch": 9.16, + "learning_rate": 4.18850380388842e-06, + "loss": 0.348, + "step": 10839, + "task_loss": 0.2844221591949463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3103012442588806, + "epoch": 9.16, + "learning_rate": 4.184277261200338e-06, + "loss": 0.3963, + "step": 10840, + "task_loss": 0.6189307570457458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3232678472995758, + "epoch": 9.16, + "learning_rate": 4.180050718512257e-06, + "loss": 0.417, + "step": 10841, + "task_loss": 0.6324960589408875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2506742477416992, + "epoch": 9.16, + "learning_rate": 4.175824175824176e-06, + "loss": 0.3506, + "step": 10842, + "task_loss": 0.44172731041908264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.279115229845047, + "epoch": 9.17, + "learning_rate": 4.171597633136095e-06, + "loss": 0.3087, + "step": 10843, + "task_loss": 1.0566439628601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23468446731567383, + "epoch": 9.17, + "learning_rate": 4.167371090448014e-06, + "loss": 0.3883, + "step": 10844, + "task_loss": 0.24102312326431274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23641732335090637, + "epoch": 9.17, + "learning_rate": 4.1631445477599325e-06, + "loss": 0.3168, + "step": 10845, + "task_loss": 0.23073643445968628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2851032018661499, + "epoch": 9.17, + "learning_rate": 4.158918005071851e-06, + "loss": 0.3434, + "step": 10846, + "task_loss": 0.6747862696647644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.493721604347229, + "epoch": 9.17, + "learning_rate": 4.15469146238377e-06, + "loss": 0.3222, + "step": 10847, + "task_loss": 0.8560011386871338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36523839831352234, + "epoch": 9.17, + "learning_rate": 4.150464919695689e-06, + "loss": 0.3498, + "step": 10848, + "task_loss": 0.5505246520042419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49917393922805786, + "epoch": 9.17, + "learning_rate": 4.146238377007608e-06, + "loss": 0.4057, + "step": 10849, + "task_loss": 0.5482868552207947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1424364447593689, + "epoch": 9.17, + "learning_rate": 4.142011834319527e-06, + "loss": 0.3282, + "step": 10850, + "task_loss": 0.5905613899230957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33855384588241577, + "epoch": 9.17, + "learning_rate": 4.137785291631446e-06, + "loss": 0.4336, + "step": 10851, + "task_loss": 0.5299764275550842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2510131597518921, + "epoch": 9.17, + "learning_rate": 4.133558748943365e-06, + "loss": 0.2721, + "step": 10852, + "task_loss": 0.905805230140686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2865440845489502, + "epoch": 9.17, + "learning_rate": 4.129332206255284e-06, + "loss": 0.4205, + "step": 10853, + "task_loss": 1.0691314935684204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4508817195892334, + "epoch": 9.17, + "learning_rate": 4.1251056635672025e-06, + "loss": 0.4624, + "step": 10854, + "task_loss": 0.46122539043426514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3504142463207245, + "epoch": 9.18, + "learning_rate": 4.120879120879121e-06, + "loss": 0.3559, + "step": 10855, + "task_loss": 0.6790086030960083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27813583612442017, + "epoch": 9.18, + "learning_rate": 4.11665257819104e-06, + "loss": 0.4212, + "step": 10856, + "task_loss": 0.6236938834190369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3508034348487854, + "epoch": 9.18, + "learning_rate": 4.112426035502958e-06, + "loss": 0.3861, + "step": 10857, + "task_loss": 0.34262847900390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.509964108467102, + "epoch": 9.18, + "learning_rate": 4.108199492814878e-06, + "loss": 0.3654, + "step": 10858, + "task_loss": 0.2515036165714264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4743852913379669, + "epoch": 9.18, + "learning_rate": 4.103972950126796e-06, + "loss": 0.3272, + "step": 10859, + "task_loss": 0.9125486612319946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14461076259613037, + "epoch": 9.18, + "learning_rate": 4.099746407438716e-06, + "loss": 0.2916, + "step": 10860, + "task_loss": 0.2800450921058655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40469595789909363, + "epoch": 9.18, + "learning_rate": 4.095519864750634e-06, + "loss": 0.3647, + "step": 10861, + "task_loss": 0.8845595121383667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29161354899406433, + "epoch": 9.18, + "learning_rate": 4.091293322062554e-06, + "loss": 0.3104, + "step": 10862, + "task_loss": 0.5635102987289429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24710771441459656, + "epoch": 9.18, + "learning_rate": 4.087066779374472e-06, + "loss": 0.2471, + "step": 10863, + "task_loss": 0.31261780858039856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2973331809043884, + "epoch": 9.18, + "learning_rate": 4.0828402366863906e-06, + "loss": 0.4577, + "step": 10864, + "task_loss": 0.9894577860832214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23789219558238983, + "epoch": 9.18, + "learning_rate": 4.0786136939983095e-06, + "loss": 0.3082, + "step": 10865, + "task_loss": 0.3083100914955139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4020971953868866, + "epoch": 9.19, + "learning_rate": 4.074387151310228e-06, + "loss": 0.413, + "step": 10866, + "task_loss": 1.3575773239135742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23139192163944244, + "epoch": 9.19, + "learning_rate": 4.070160608622147e-06, + "loss": 0.3787, + "step": 10867, + "task_loss": 0.7296725511550903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2534216344356537, + "epoch": 9.19, + "learning_rate": 4.065934065934066e-06, + "loss": 0.3212, + "step": 10868, + "task_loss": 0.7202332019805908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17211177945137024, + "epoch": 9.19, + "learning_rate": 4.061707523245985e-06, + "loss": 0.3443, + "step": 10869, + "task_loss": 0.01822839118540287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4365965723991394, + "epoch": 9.19, + "learning_rate": 4.057480980557904e-06, + "loss": 0.3509, + "step": 10870, + "task_loss": 0.42886412143707275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6573877930641174, + "epoch": 9.19, + "learning_rate": 4.053254437869823e-06, + "loss": 0.3724, + "step": 10871, + "task_loss": 0.6392317414283752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4887334406375885, + "epoch": 9.19, + "learning_rate": 4.049027895181742e-06, + "loss": 0.5488, + "step": 10872, + "task_loss": 0.7437435984611511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16104495525360107, + "epoch": 9.19, + "learning_rate": 4.04480135249366e-06, + "loss": 0.2974, + "step": 10873, + "task_loss": 0.3282138407230377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24984420835971832, + "epoch": 9.19, + "learning_rate": 4.0405748098055795e-06, + "loss": 0.4116, + "step": 10874, + "task_loss": 0.48430758714675903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2619593143463135, + "epoch": 9.19, + "learning_rate": 4.0363482671174975e-06, + "loss": 0.2358, + "step": 10875, + "task_loss": 0.6308826208114624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38115665316581726, + "epoch": 9.19, + "learning_rate": 4.032121724429417e-06, + "loss": 0.4476, + "step": 10876, + "task_loss": 0.6427804827690125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39664244651794434, + "epoch": 9.19, + "learning_rate": 4.027895181741335e-06, + "loss": 0.4083, + "step": 10877, + "task_loss": 0.5090481042861938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22147826850414276, + "epoch": 9.2, + "learning_rate": 4.023668639053255e-06, + "loss": 0.3935, + "step": 10878, + "task_loss": 0.06252767145633698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.443624883890152, + "epoch": 9.2, + "learning_rate": 4.019442096365174e-06, + "loss": 0.4139, + "step": 10879, + "task_loss": 0.8653573393821716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4937755763530731, + "epoch": 9.2, + "learning_rate": 4.015215553677092e-06, + "loss": 0.5055, + "step": 10880, + "task_loss": 1.0028654336929321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3918817341327667, + "epoch": 9.2, + "learning_rate": 4.010989010989012e-06, + "loss": 0.397, + "step": 10881, + "task_loss": 0.3180001676082611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3301582634449005, + "epoch": 9.2, + "learning_rate": 4.00676246830093e-06, + "loss": 0.4935, + "step": 10882, + "task_loss": 1.3016095161437988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5090761184692383, + "epoch": 9.2, + "learning_rate": 4.0025359256128495e-06, + "loss": 0.4918, + "step": 10883, + "task_loss": 0.9981525540351868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.245442196726799, + "epoch": 9.2, + "learning_rate": 3.9983093829247675e-06, + "loss": 0.266, + "step": 10884, + "task_loss": 0.15953753888607025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2800212502479553, + "epoch": 9.2, + "learning_rate": 3.9940828402366864e-06, + "loss": 0.4029, + "step": 10885, + "task_loss": 0.5439020991325378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42538100481033325, + "epoch": 9.2, + "learning_rate": 3.989856297548605e-06, + "loss": 0.4544, + "step": 10886, + "task_loss": 0.35914820432662964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31956934928894043, + "epoch": 9.2, + "learning_rate": 3.985629754860524e-06, + "loss": 0.3918, + "step": 10887, + "task_loss": 0.4242209792137146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1609598994255066, + "epoch": 9.2, + "learning_rate": 3.981403212172443e-06, + "loss": 0.2285, + "step": 10888, + "task_loss": 0.5662160515785217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6493076682090759, + "epoch": 9.2, + "learning_rate": 3.977176669484362e-06, + "loss": 0.4032, + "step": 10889, + "task_loss": 1.171083688735962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.421529620885849, + "epoch": 9.21, + "learning_rate": 3.972950126796281e-06, + "loss": 0.3771, + "step": 10890, + "task_loss": 0.49656879901885986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2736101746559143, + "epoch": 9.21, + "learning_rate": 3.9687235841082e-06, + "loss": 0.3418, + "step": 10891, + "task_loss": 0.10867220163345337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32187724113464355, + "epoch": 9.21, + "learning_rate": 3.964497041420119e-06, + "loss": 0.4061, + "step": 10892, + "task_loss": 0.44555678963661194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3557005822658539, + "epoch": 9.21, + "learning_rate": 3.9602704987320376e-06, + "loss": 0.3318, + "step": 10893, + "task_loss": 0.5973302125930786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3964943289756775, + "epoch": 9.21, + "learning_rate": 3.9560439560439565e-06, + "loss": 0.2838, + "step": 10894, + "task_loss": 0.47388774156570435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26992467045783997, + "epoch": 9.21, + "learning_rate": 3.951817413355875e-06, + "loss": 0.3673, + "step": 10895, + "task_loss": 0.37616825103759766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2727281451225281, + "epoch": 9.21, + "learning_rate": 3.947590870667793e-06, + "loss": 0.3779, + "step": 10896, + "task_loss": 0.5296937823295593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1504100263118744, + "epoch": 9.21, + "learning_rate": 3.943364327979713e-06, + "loss": 0.2578, + "step": 10897, + "task_loss": 0.8117237091064453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2810574769973755, + "epoch": 9.21, + "learning_rate": 3.939137785291631e-06, + "loss": 0.2518, + "step": 10898, + "task_loss": 0.1291680783033371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.418807715177536, + "epoch": 9.21, + "learning_rate": 3.934911242603551e-06, + "loss": 0.3558, + "step": 10899, + "task_loss": 1.055646300315857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5120546221733093, + "epoch": 9.21, + "learning_rate": 3.930684699915469e-06, + "loss": 0.3928, + "step": 10900, + "task_loss": 0.8648068904876709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47959765791893005, + "epoch": 9.21, + "learning_rate": 3.926458157227388e-06, + "loss": 0.4048, + "step": 10901, + "task_loss": 0.3264666497707367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2359699010848999, + "epoch": 9.22, + "learning_rate": 3.922231614539307e-06, + "loss": 0.3088, + "step": 10902, + "task_loss": 0.6279082298278809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19544366002082825, + "epoch": 9.22, + "learning_rate": 3.918005071851226e-06, + "loss": 0.3315, + "step": 10903, + "task_loss": 0.05388445779681206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27638497948646545, + "epoch": 9.22, + "learning_rate": 3.9137785291631445e-06, + "loss": 0.3884, + "step": 10904, + "task_loss": 0.5018625259399414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2042871117591858, + "epoch": 9.22, + "learning_rate": 3.909551986475063e-06, + "loss": 0.3475, + "step": 10905, + "task_loss": 0.5975039005279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3266107738018036, + "epoch": 9.22, + "learning_rate": 3.905325443786982e-06, + "loss": 0.3685, + "step": 10906, + "task_loss": 1.1789569854736328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6518645882606506, + "epoch": 9.22, + "learning_rate": 3.901098901098901e-06, + "loss": 0.4929, + "step": 10907, + "task_loss": 1.005140781402588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2009197473526001, + "epoch": 9.22, + "learning_rate": 3.89687235841082e-06, + "loss": 0.3198, + "step": 10908, + "task_loss": 0.5147883296012878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4295263886451721, + "epoch": 9.22, + "learning_rate": 3.892645815722739e-06, + "loss": 0.37, + "step": 10909, + "task_loss": 1.220171570777893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31452563405036926, + "epoch": 9.22, + "learning_rate": 3.888419273034658e-06, + "loss": 0.3429, + "step": 10910, + "task_loss": 0.4227457642555237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3117865324020386, + "epoch": 9.22, + "learning_rate": 3.884192730346577e-06, + "loss": 0.3468, + "step": 10911, + "task_loss": 0.30923670530319214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18727166950702667, + "epoch": 9.22, + "learning_rate": 3.879966187658496e-06, + "loss": 0.2574, + "step": 10912, + "task_loss": 0.5415329337120056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2038675844669342, + "epoch": 9.22, + "learning_rate": 3.8757396449704146e-06, + "loss": 0.2664, + "step": 10913, + "task_loss": 0.9026894569396973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.52046138048172, + "epoch": 9.23, + "learning_rate": 3.8715131022823334e-06, + "loss": 0.5213, + "step": 10914, + "task_loss": 0.6144980192184448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3290405869483948, + "epoch": 9.23, + "learning_rate": 3.867286559594252e-06, + "loss": 0.2744, + "step": 10915, + "task_loss": 1.0736050605773926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7283310294151306, + "epoch": 9.23, + "learning_rate": 3.863060016906171e-06, + "loss": 0.4392, + "step": 10916, + "task_loss": 0.8360440135002136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.302859902381897, + "epoch": 9.23, + "learning_rate": 3.858833474218089e-06, + "loss": 0.3762, + "step": 10917, + "task_loss": 0.7501512169837952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4375344514846802, + "epoch": 9.23, + "learning_rate": 3.854606931530009e-06, + "loss": 0.4478, + "step": 10918, + "task_loss": 0.34103307127952576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5863479375839233, + "epoch": 9.23, + "learning_rate": 3.850380388841927e-06, + "loss": 0.4381, + "step": 10919, + "task_loss": 0.33770954608917236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3453470766544342, + "epoch": 9.23, + "learning_rate": 3.846153846153847e-06, + "loss": 0.368, + "step": 10920, + "task_loss": 0.3951333165168762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30058974027633667, + "epoch": 9.23, + "learning_rate": 3.841927303465765e-06, + "loss": 0.3175, + "step": 10921, + "task_loss": 0.44761788845062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26135939359664917, + "epoch": 9.23, + "learning_rate": 3.837700760777685e-06, + "loss": 0.3161, + "step": 10922, + "task_loss": 0.944241464138031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6498162746429443, + "epoch": 9.23, + "learning_rate": 3.833474218089603e-06, + "loss": 0.5597, + "step": 10923, + "task_loss": 0.43873777985572815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17337553203105927, + "epoch": 9.23, + "learning_rate": 3.8292476754015215e-06, + "loss": 0.2333, + "step": 10924, + "task_loss": 0.2576712667942047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.276930570602417, + "epoch": 9.23, + "learning_rate": 3.82502113271344e-06, + "loss": 0.3794, + "step": 10925, + "task_loss": 0.9625747203826904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27496975660324097, + "epoch": 9.24, + "learning_rate": 3.820794590025359e-06, + "loss": 0.3037, + "step": 10926, + "task_loss": 0.47768476605415344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26467365026474, + "epoch": 9.24, + "learning_rate": 3.816568047337278e-06, + "loss": 0.3473, + "step": 10927, + "task_loss": 0.44053125381469727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2823783755302429, + "epoch": 9.24, + "learning_rate": 3.812341504649197e-06, + "loss": 0.4024, + "step": 10928, + "task_loss": 0.3518713414669037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2675750255584717, + "epoch": 9.24, + "learning_rate": 3.8081149619611155e-06, + "loss": 0.2754, + "step": 10929, + "task_loss": 0.4231777787208557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.281505823135376, + "epoch": 9.24, + "learning_rate": 3.803888419273035e-06, + "loss": 0.4318, + "step": 10930, + "task_loss": 0.8863570690155029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24458247423171997, + "epoch": 9.24, + "learning_rate": 3.7996618765849533e-06, + "loss": 0.2778, + "step": 10931, + "task_loss": 0.5408713817596436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37484899163246155, + "epoch": 9.24, + "learning_rate": 3.7954353338968726e-06, + "loss": 0.3493, + "step": 10932, + "task_loss": 0.11423295736312866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.412622332572937, + "epoch": 9.24, + "learning_rate": 3.791208791208791e-06, + "loss": 0.3536, + "step": 10933, + "task_loss": 0.6945928335189819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4011216163635254, + "epoch": 9.24, + "learning_rate": 3.7869822485207104e-06, + "loss": 0.331, + "step": 10934, + "task_loss": 1.1550707817077637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28640639781951904, + "epoch": 9.24, + "learning_rate": 3.782755705832629e-06, + "loss": 0.3988, + "step": 10935, + "task_loss": 0.36315611004829407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3395368158817291, + "epoch": 9.24, + "learning_rate": 3.778529163144548e-06, + "loss": 0.4247, + "step": 10936, + "task_loss": 0.316150963306427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31310397386550903, + "epoch": 9.24, + "learning_rate": 3.7743026204564663e-06, + "loss": 0.3517, + "step": 10937, + "task_loss": 0.5538923144340515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3481917977333069, + "epoch": 9.25, + "learning_rate": 3.7700760777683856e-06, + "loss": 0.3437, + "step": 10938, + "task_loss": 0.7409000396728516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4529663026332855, + "epoch": 9.25, + "learning_rate": 3.765849535080305e-06, + "loss": 0.3874, + "step": 10939, + "task_loss": 0.6403841972351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25365138053894043, + "epoch": 9.25, + "learning_rate": 3.7616229923922234e-06, + "loss": 0.3634, + "step": 10940, + "task_loss": 0.15202222764492035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5728182196617126, + "epoch": 9.25, + "learning_rate": 3.7573964497041427e-06, + "loss": 0.4113, + "step": 10941, + "task_loss": 1.1576825380325317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23296308517456055, + "epoch": 9.25, + "learning_rate": 3.753169907016061e-06, + "loss": 0.3967, + "step": 10942, + "task_loss": 0.05683799833059311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.380463570356369, + "epoch": 9.25, + "learning_rate": 3.74894336432798e-06, + "loss": 0.3772, + "step": 10943, + "task_loss": 1.3070838451385498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3895018398761749, + "epoch": 9.25, + "learning_rate": 3.7447168216398985e-06, + "loss": 0.3434, + "step": 10944, + "task_loss": 0.6772066950798035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2726972699165344, + "epoch": 9.25, + "learning_rate": 3.740490278951818e-06, + "loss": 0.3081, + "step": 10945, + "task_loss": 0.9403440952301025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.750728964805603, + "epoch": 9.25, + "learning_rate": 3.7362637362637363e-06, + "loss": 0.6266, + "step": 10946, + "task_loss": 0.8400091528892517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2021588534116745, + "epoch": 9.25, + "learning_rate": 3.7320371935756556e-06, + "loss": 0.2756, + "step": 10947, + "task_loss": 0.25848424434661865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6216771602630615, + "epoch": 9.25, + "learning_rate": 3.727810650887574e-06, + "loss": 0.4214, + "step": 10948, + "task_loss": 0.4798125624656677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20072919130325317, + "epoch": 9.26, + "learning_rate": 3.7235841081994934e-06, + "loss": 0.3727, + "step": 10949, + "task_loss": 1.859929084777832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21267297863960266, + "epoch": 9.26, + "learning_rate": 3.719357565511412e-06, + "loss": 0.3265, + "step": 10950, + "task_loss": 0.3794448673725128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2294258028268814, + "epoch": 9.26, + "learning_rate": 3.7151310228233307e-06, + "loss": 0.3656, + "step": 10951, + "task_loss": 0.5274657011032104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28940099477767944, + "epoch": 9.26, + "learning_rate": 3.710904480135249e-06, + "loss": 0.3166, + "step": 10952, + "task_loss": 1.2432137727737427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5230440497398376, + "epoch": 9.26, + "learning_rate": 3.7066779374471685e-06, + "loss": 0.4425, + "step": 10953, + "task_loss": 0.23548617959022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.328269898891449, + "epoch": 9.26, + "learning_rate": 3.702451394759087e-06, + "loss": 0.3651, + "step": 10954, + "task_loss": 1.5223937034606934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4088091552257538, + "epoch": 9.26, + "learning_rate": 3.6982248520710063e-06, + "loss": 0.3338, + "step": 10955, + "task_loss": 0.39432278275489807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30290836095809937, + "epoch": 9.26, + "learning_rate": 3.6939983093829248e-06, + "loss": 0.2839, + "step": 10956, + "task_loss": 0.3671436905860901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3768446743488312, + "epoch": 9.26, + "learning_rate": 3.689771766694844e-06, + "loss": 0.3686, + "step": 10957, + "task_loss": 0.43358027935028076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.272566556930542, + "epoch": 9.26, + "learning_rate": 3.6855452240067626e-06, + "loss": 0.3659, + "step": 10958, + "task_loss": 0.5586867928504944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21464037895202637, + "epoch": 9.26, + "learning_rate": 3.6813186813186814e-06, + "loss": 0.3118, + "step": 10959, + "task_loss": 0.4399377107620239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27069610357284546, + "epoch": 9.26, + "learning_rate": 3.6770921386306e-06, + "loss": 0.352, + "step": 10960, + "task_loss": 0.7179108262062073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3811045289039612, + "epoch": 9.27, + "learning_rate": 3.6728655959425192e-06, + "loss": 0.4334, + "step": 10961, + "task_loss": 0.7550804615020752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3838531970977783, + "epoch": 9.27, + "learning_rate": 3.6686390532544377e-06, + "loss": 0.4031, + "step": 10962, + "task_loss": 0.5705326795578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24511802196502686, + "epoch": 9.27, + "learning_rate": 3.664412510566357e-06, + "loss": 0.3077, + "step": 10963, + "task_loss": 1.2790706157684326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37313956022262573, + "epoch": 9.27, + "learning_rate": 3.6601859678782755e-06, + "loss": 0.3109, + "step": 10964, + "task_loss": 0.2583644390106201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2917196452617645, + "epoch": 9.27, + "learning_rate": 3.655959425190195e-06, + "loss": 0.2744, + "step": 10965, + "task_loss": 0.3394607603549957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33091214299201965, + "epoch": 9.27, + "learning_rate": 3.6517328825021133e-06, + "loss": 0.3297, + "step": 10966, + "task_loss": 0.2677742540836334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16183951497077942, + "epoch": 9.27, + "learning_rate": 3.647506339814032e-06, + "loss": 0.2684, + "step": 10967, + "task_loss": 0.12940217554569244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4263492822647095, + "epoch": 9.27, + "learning_rate": 3.6432797971259506e-06, + "loss": 0.3809, + "step": 10968, + "task_loss": 0.7249758839607239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3830980956554413, + "epoch": 9.27, + "learning_rate": 3.63905325443787e-06, + "loss": 0.3199, + "step": 10969, + "task_loss": 0.4624652564525604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3715564012527466, + "epoch": 9.27, + "learning_rate": 3.6348267117497893e-06, + "loss": 0.4076, + "step": 10970, + "task_loss": 0.09809459745883942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.450778067111969, + "epoch": 9.27, + "learning_rate": 3.6306001690617077e-06, + "loss": 0.4605, + "step": 10971, + "task_loss": 1.3819876909255981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22241252660751343, + "epoch": 9.27, + "learning_rate": 3.626373626373627e-06, + "loss": 0.4126, + "step": 10972, + "task_loss": 0.10224239528179169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3260951638221741, + "epoch": 9.28, + "learning_rate": 3.6221470836855455e-06, + "loss": 0.3765, + "step": 10973, + "task_loss": 0.2895330786705017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40865862369537354, + "epoch": 9.28, + "learning_rate": 3.6179205409974644e-06, + "loss": 0.3444, + "step": 10974, + "task_loss": 0.3284333050251007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44510844349861145, + "epoch": 9.28, + "learning_rate": 3.613693998309383e-06, + "loss": 0.3402, + "step": 10975, + "task_loss": 0.2670443058013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6085115671157837, + "epoch": 9.28, + "learning_rate": 3.609467455621302e-06, + "loss": 0.4282, + "step": 10976, + "task_loss": 0.2907293140888214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32963934540748596, + "epoch": 9.28, + "learning_rate": 3.6052409129332206e-06, + "loss": 0.4219, + "step": 10977, + "task_loss": 0.8998923897743225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3749598264694214, + "epoch": 9.28, + "learning_rate": 3.60101437024514e-06, + "loss": 0.3874, + "step": 10978, + "task_loss": 0.37733250856399536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31837302446365356, + "epoch": 9.28, + "learning_rate": 3.5967878275570584e-06, + "loss": 0.2408, + "step": 10979, + "task_loss": 0.8546398878097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31893807649612427, + "epoch": 9.28, + "learning_rate": 3.5925612848689777e-06, + "loss": 0.3398, + "step": 10980, + "task_loss": 0.9907719492912292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.543782651424408, + "epoch": 9.28, + "learning_rate": 3.5883347421808962e-06, + "loss": 0.4062, + "step": 10981, + "task_loss": 1.19627845287323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2998760938644409, + "epoch": 9.28, + "learning_rate": 3.584108199492815e-06, + "loss": 0.3882, + "step": 10982, + "task_loss": 0.6442987322807312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3204306960105896, + "epoch": 9.28, + "learning_rate": 3.5798816568047336e-06, + "loss": 0.328, + "step": 10983, + "task_loss": 0.5392677187919617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17468169331550598, + "epoch": 9.28, + "learning_rate": 3.575655114116653e-06, + "loss": 0.2504, + "step": 10984, + "task_loss": 1.106391429901123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17096957564353943, + "epoch": 9.29, + "learning_rate": 3.5714285714285714e-06, + "loss": 0.3143, + "step": 10985, + "task_loss": 0.11747336387634277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2537137567996979, + "epoch": 9.29, + "learning_rate": 3.5672020287404907e-06, + "loss": 0.3988, + "step": 10986, + "task_loss": 0.5070047974586487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3965149223804474, + "epoch": 9.29, + "learning_rate": 3.562975486052409e-06, + "loss": 0.4141, + "step": 10987, + "task_loss": 0.7759620547294617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4468211829662323, + "epoch": 9.29, + "learning_rate": 3.5587489433643285e-06, + "loss": 0.4267, + "step": 10988, + "task_loss": 1.0616810321807861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2447449117898941, + "epoch": 9.29, + "learning_rate": 3.554522400676247e-06, + "loss": 0.3487, + "step": 10989, + "task_loss": 0.21285954117774963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34608194231987, + "epoch": 9.29, + "learning_rate": 3.550295857988166e-06, + "loss": 0.3429, + "step": 10990, + "task_loss": 0.14111295342445374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3419375717639923, + "epoch": 9.29, + "learning_rate": 3.5460693153000843e-06, + "loss": 0.3183, + "step": 10991, + "task_loss": 0.5419254302978516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42309314012527466, + "epoch": 9.29, + "learning_rate": 3.5418427726120036e-06, + "loss": 0.3111, + "step": 10992, + "task_loss": 0.4497659504413605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6240777373313904, + "epoch": 9.29, + "learning_rate": 3.537616229923922e-06, + "loss": 0.3926, + "step": 10993, + "task_loss": 1.184451937675476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18123583495616913, + "epoch": 9.29, + "learning_rate": 3.5333896872358414e-06, + "loss": 0.2873, + "step": 10994, + "task_loss": 1.2545934915542603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5459150075912476, + "epoch": 9.29, + "learning_rate": 3.52916314454776e-06, + "loss": 0.3628, + "step": 10995, + "task_loss": 0.7613987326622009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33714455366134644, + "epoch": 9.29, + "learning_rate": 3.524936601859679e-06, + "loss": 0.3338, + "step": 10996, + "task_loss": 0.5930745601654053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4005102217197418, + "epoch": 9.3, + "learning_rate": 3.5207100591715976e-06, + "loss": 0.4147, + "step": 10997, + "task_loss": 0.21574874222278595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.271607905626297, + "epoch": 9.3, + "learning_rate": 3.5164835164835165e-06, + "loss": 0.2933, + "step": 10998, + "task_loss": 0.2806920111179352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3341159522533417, + "epoch": 9.3, + "learning_rate": 3.512256973795436e-06, + "loss": 0.3537, + "step": 10999, + "task_loss": 0.31831231713294983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2314402461051941, + "epoch": 9.3, + "learning_rate": 3.5080304311073543e-06, + "loss": 0.3196, + "step": 11000, + "task_loss": 0.11587528139352798 + }, + { + "epoch": 9.3, + "eval_accuracy": 0.917980198019802, + "eval_loss": 0.24436061084270477, + "eval_runtime": 225.5759, + "eval_samples_per_second": 111.936, + "eval_steps_per_second": 0.878, + "step": 11000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5138021111488342, + "epoch": 9.3, + "learning_rate": 3.5038038884192736e-06, + "loss": 0.4471, + "step": 11001, + "task_loss": 0.8041527271270752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39428335428237915, + "epoch": 9.3, + "learning_rate": 3.499577345731192e-06, + "loss": 0.3563, + "step": 11002, + "task_loss": 1.0692877769470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3546184301376343, + "epoch": 9.3, + "learning_rate": 3.495350803043111e-06, + "loss": 0.4144, + "step": 11003, + "task_loss": 0.7868075370788574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5297516584396362, + "epoch": 9.3, + "learning_rate": 3.49112426035503e-06, + "loss": 0.3983, + "step": 11004, + "task_loss": 1.1671717166900635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2653290331363678, + "epoch": 9.3, + "learning_rate": 3.4868977176669488e-06, + "loss": 0.214, + "step": 11005, + "task_loss": 0.04818599671125412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4491077959537506, + "epoch": 9.3, + "learning_rate": 3.4826711749788672e-06, + "loss": 0.4928, + "step": 11006, + "task_loss": 0.13281863927841187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2281665951013565, + "epoch": 9.3, + "learning_rate": 3.4784446322907865e-06, + "loss": 0.3434, + "step": 11007, + "task_loss": 0.785679042339325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3459298014640808, + "epoch": 9.3, + "learning_rate": 3.474218089602705e-06, + "loss": 0.3259, + "step": 11008, + "task_loss": 0.5178593397140503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.10948862135410309, + "epoch": 9.31, + "learning_rate": 3.4699915469146243e-06, + "loss": 0.273, + "step": 11009, + "task_loss": 0.01376782450824976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28809839487075806, + "epoch": 9.31, + "learning_rate": 3.465765004226543e-06, + "loss": 0.3482, + "step": 11010, + "task_loss": 0.4283992350101471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26329946517944336, + "epoch": 9.31, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.3507, + "step": 11011, + "task_loss": 0.3022018074989319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6969354748725891, + "epoch": 9.31, + "learning_rate": 3.4573119188503806e-06, + "loss": 0.45, + "step": 11012, + "task_loss": 0.19120028614997864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34133753180503845, + "epoch": 9.31, + "learning_rate": 3.4530853761622995e-06, + "loss": 0.2691, + "step": 11013, + "task_loss": 0.49421724677085876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3515082001686096, + "epoch": 9.31, + "learning_rate": 3.448858833474218e-06, + "loss": 0.3355, + "step": 11014, + "task_loss": 0.7741529941558838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40698665380477905, + "epoch": 9.31, + "learning_rate": 3.4446322907861373e-06, + "loss": 0.3268, + "step": 11015, + "task_loss": 1.5545374155044556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26684123277664185, + "epoch": 9.31, + "learning_rate": 3.4404057480980557e-06, + "loss": 0.417, + "step": 11016, + "task_loss": 0.7231999039649963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5027421116828918, + "epoch": 9.31, + "learning_rate": 3.436179205409975e-06, + "loss": 0.4991, + "step": 11017, + "task_loss": 0.9126652479171753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31330594420433044, + "epoch": 9.31, + "learning_rate": 3.4319526627218935e-06, + "loss": 0.3356, + "step": 11018, + "task_loss": 0.4716689884662628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4825432598590851, + "epoch": 9.31, + "learning_rate": 3.4277261200338124e-06, + "loss": 0.4306, + "step": 11019, + "task_loss": 0.6834752559661865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27844345569610596, + "epoch": 9.32, + "learning_rate": 3.4234995773457313e-06, + "loss": 0.4129, + "step": 11020, + "task_loss": 0.6349384784698486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.350713849067688, + "epoch": 9.32, + "learning_rate": 3.41927303465765e-06, + "loss": 0.3601, + "step": 11021, + "task_loss": 0.7604373097419739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18268582224845886, + "epoch": 9.32, + "learning_rate": 3.4150464919695687e-06, + "loss": 0.3273, + "step": 11022, + "task_loss": 0.5454663038253784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28187012672424316, + "epoch": 9.32, + "learning_rate": 3.410819949281488e-06, + "loss": 0.3841, + "step": 11023, + "task_loss": 0.6284623742103577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45291391015052795, + "epoch": 9.32, + "learning_rate": 3.4065934065934064e-06, + "loss": 0.3761, + "step": 11024, + "task_loss": 0.7281866073608398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35542598366737366, + "epoch": 9.32, + "learning_rate": 3.4023668639053257e-06, + "loss": 0.2719, + "step": 11025, + "task_loss": 0.6919620633125305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19045458734035492, + "epoch": 9.32, + "learning_rate": 3.3981403212172442e-06, + "loss": 0.3264, + "step": 11026, + "task_loss": 0.6196629405021667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6128476858139038, + "epoch": 9.32, + "learning_rate": 3.393913778529163e-06, + "loss": 0.345, + "step": 11027, + "task_loss": 0.5651718378067017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6871416568756104, + "epoch": 9.32, + "learning_rate": 3.3896872358410824e-06, + "loss": 0.4755, + "step": 11028, + "task_loss": 1.1522578001022339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1961861103773117, + "epoch": 9.32, + "learning_rate": 3.385460693153001e-06, + "loss": 0.3857, + "step": 11029, + "task_loss": 1.2403775453567505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4689747095108032, + "epoch": 9.32, + "learning_rate": 3.38123415046492e-06, + "loss": 0.4099, + "step": 11030, + "task_loss": 0.35707172751426697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3622818887233734, + "epoch": 9.32, + "learning_rate": 3.3770076077768387e-06, + "loss": 0.3056, + "step": 11031, + "task_loss": 0.22565320134162903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45878249406814575, + "epoch": 9.33, + "learning_rate": 3.372781065088758e-06, + "loss": 0.372, + "step": 11032, + "task_loss": 0.47638577222824097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30390945076942444, + "epoch": 9.33, + "learning_rate": 3.3685545224006765e-06, + "loss": 0.3247, + "step": 11033, + "task_loss": 0.363949179649353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2505924105644226, + "epoch": 9.33, + "learning_rate": 3.3643279797125954e-06, + "loss": 0.3907, + "step": 11034, + "task_loss": 0.331704318523407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2890142500400543, + "epoch": 9.33, + "learning_rate": 3.360101437024514e-06, + "loss": 0.3096, + "step": 11035, + "task_loss": 0.5168416500091553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22444498538970947, + "epoch": 9.33, + "learning_rate": 3.355874894336433e-06, + "loss": 0.3775, + "step": 11036, + "task_loss": 0.19301094114780426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21093818545341492, + "epoch": 9.33, + "learning_rate": 3.3516483516483516e-06, + "loss": 0.3235, + "step": 11037, + "task_loss": 0.1737367957830429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3835913836956024, + "epoch": 9.33, + "learning_rate": 3.347421808960271e-06, + "loss": 0.3997, + "step": 11038, + "task_loss": 1.3739631175994873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30857211351394653, + "epoch": 9.33, + "learning_rate": 3.3431952662721894e-06, + "loss": 0.3385, + "step": 11039, + "task_loss": 0.36353522539138794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2637791633605957, + "epoch": 9.33, + "learning_rate": 3.3389687235841087e-06, + "loss": 0.336, + "step": 11040, + "task_loss": 0.6328445076942444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3103238642215729, + "epoch": 9.33, + "learning_rate": 3.334742180896027e-06, + "loss": 0.327, + "step": 11041, + "task_loss": 0.29911383986473083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2658078074455261, + "epoch": 9.33, + "learning_rate": 3.330515638207946e-06, + "loss": 0.411, + "step": 11042, + "task_loss": 0.1911116987466812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2921827435493469, + "epoch": 9.33, + "learning_rate": 3.3262890955198645e-06, + "loss": 0.4182, + "step": 11043, + "task_loss": 0.8218912482261658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3119533061981201, + "epoch": 9.34, + "learning_rate": 3.322062552831784e-06, + "loss": 0.3903, + "step": 11044, + "task_loss": 0.11271906644105911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36212167143821716, + "epoch": 9.34, + "learning_rate": 3.3178360101437023e-06, + "loss": 0.3574, + "step": 11045, + "task_loss": 0.40594732761383057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40093764662742615, + "epoch": 9.34, + "learning_rate": 3.3136094674556216e-06, + "loss": 0.3564, + "step": 11046, + "task_loss": 0.7874270677566528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23478049039840698, + "epoch": 9.34, + "learning_rate": 3.30938292476754e-06, + "loss": 0.2518, + "step": 11047, + "task_loss": 0.17699165642261505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30439358949661255, + "epoch": 9.34, + "learning_rate": 3.3051563820794594e-06, + "loss": 0.5237, + "step": 11048, + "task_loss": 1.073488473892212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18234124779701233, + "epoch": 9.34, + "learning_rate": 3.300929839391378e-06, + "loss": 0.2784, + "step": 11049, + "task_loss": 0.3398984968662262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2595115900039673, + "epoch": 9.34, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.3271, + "step": 11050, + "task_loss": 0.6616547107696533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37582555413246155, + "epoch": 9.34, + "learning_rate": 3.2924767540152152e-06, + "loss": 0.3551, + "step": 11051, + "task_loss": 0.3572853207588196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3168129622936249, + "epoch": 9.34, + "learning_rate": 3.2882502113271346e-06, + "loss": 0.3869, + "step": 11052, + "task_loss": 0.4606495201587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47435352206230164, + "epoch": 9.34, + "learning_rate": 3.284023668639053e-06, + "loss": 0.3695, + "step": 11053, + "task_loss": 0.5183403491973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26282864809036255, + "epoch": 9.34, + "learning_rate": 3.2797971259509723e-06, + "loss": 0.2758, + "step": 11054, + "task_loss": 0.45322901010513306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.325207382440567, + "epoch": 9.34, + "learning_rate": 3.275570583262891e-06, + "loss": 0.3171, + "step": 11055, + "task_loss": 0.5011888742446899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1975768506526947, + "epoch": 9.35, + "learning_rate": 3.27134404057481e-06, + "loss": 0.3498, + "step": 11056, + "task_loss": 1.1277540922164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28575214743614197, + "epoch": 9.35, + "learning_rate": 3.2671174978867286e-06, + "loss": 0.3435, + "step": 11057, + "task_loss": 0.5960267186164856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3164902925491333, + "epoch": 9.35, + "learning_rate": 3.2628909551986475e-06, + "loss": 0.2867, + "step": 11058, + "task_loss": 0.17749302089214325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.492826372385025, + "epoch": 9.35, + "learning_rate": 3.258664412510567e-06, + "loss": 0.3331, + "step": 11059, + "task_loss": 0.37923234701156616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4420795440673828, + "epoch": 9.35, + "learning_rate": 3.2544378698224853e-06, + "loss": 0.4406, + "step": 11060, + "task_loss": 0.31104207038879395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36642709374427795, + "epoch": 9.35, + "learning_rate": 3.2502113271344046e-06, + "loss": 0.2988, + "step": 11061, + "task_loss": 0.5089414715766907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2447928935289383, + "epoch": 9.35, + "learning_rate": 3.245984784446323e-06, + "loss": 0.3991, + "step": 11062, + "task_loss": 0.640876829624176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5279614925384521, + "epoch": 9.35, + "learning_rate": 3.2417582417582424e-06, + "loss": 0.4147, + "step": 11063, + "task_loss": 0.42881929874420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33532777428627014, + "epoch": 9.35, + "learning_rate": 3.237531699070161e-06, + "loss": 0.349, + "step": 11064, + "task_loss": 0.8758630156517029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4493199586868286, + "epoch": 9.35, + "learning_rate": 3.2333051563820797e-06, + "loss": 0.4172, + "step": 11065, + "task_loss": 0.42403489351272583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35518717765808105, + "epoch": 9.35, + "learning_rate": 3.229078613693998e-06, + "loss": 0.4044, + "step": 11066, + "task_loss": 0.36862438917160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.603822648525238, + "epoch": 9.35, + "learning_rate": 3.2248520710059175e-06, + "loss": 0.4129, + "step": 11067, + "task_loss": 0.713758647441864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22391360998153687, + "epoch": 9.36, + "learning_rate": 3.220625528317836e-06, + "loss": 0.3396, + "step": 11068, + "task_loss": 0.0794229656457901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18486909568309784, + "epoch": 9.36, + "learning_rate": 3.2163989856297553e-06, + "loss": 0.2956, + "step": 11069, + "task_loss": 0.07828221470117569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19117680191993713, + "epoch": 9.36, + "learning_rate": 3.2121724429416738e-06, + "loss": 0.2602, + "step": 11070, + "task_loss": 0.6718312501907349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40417739748954773, + "epoch": 9.36, + "learning_rate": 3.207945900253593e-06, + "loss": 0.3964, + "step": 11071, + "task_loss": 0.718445897102356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34396642446517944, + "epoch": 9.36, + "learning_rate": 3.2037193575655115e-06, + "loss": 0.3633, + "step": 11072, + "task_loss": 1.2145955562591553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2959299683570862, + "epoch": 9.36, + "learning_rate": 3.1994928148774304e-06, + "loss": 0.3395, + "step": 11073, + "task_loss": 0.3921305537223816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2636890113353729, + "epoch": 9.36, + "learning_rate": 3.195266272189349e-06, + "loss": 0.3884, + "step": 11074, + "task_loss": 0.7005857825279236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2128930687904358, + "epoch": 9.36, + "learning_rate": 3.191039729501268e-06, + "loss": 0.3592, + "step": 11075, + "task_loss": 0.2898893356323242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25692427158355713, + "epoch": 9.36, + "learning_rate": 3.1868131868131867e-06, + "loss": 0.3596, + "step": 11076, + "task_loss": 0.5875060558319092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3619915246963501, + "epoch": 9.36, + "learning_rate": 3.182586644125106e-06, + "loss": 0.3039, + "step": 11077, + "task_loss": 0.5276740789413452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24015280604362488, + "epoch": 9.36, + "learning_rate": 3.1783601014370245e-06, + "loss": 0.3402, + "step": 11078, + "task_loss": 0.6459085941314697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22988620400428772, + "epoch": 9.36, + "learning_rate": 3.1741335587489438e-06, + "loss": 0.2984, + "step": 11079, + "task_loss": 0.12321125715970993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6407639384269714, + "epoch": 9.37, + "learning_rate": 3.1699070160608622e-06, + "loss": 0.4384, + "step": 11080, + "task_loss": 1.0596818923950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3968571722507477, + "epoch": 9.37, + "learning_rate": 3.165680473372781e-06, + "loss": 0.3579, + "step": 11081, + "task_loss": 0.0914900153875351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7607548236846924, + "epoch": 9.37, + "learning_rate": 3.1614539306846996e-06, + "loss": 0.5509, + "step": 11082, + "task_loss": 1.328239917755127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5544617772102356, + "epoch": 9.37, + "learning_rate": 3.157227387996619e-06, + "loss": 0.3957, + "step": 11083, + "task_loss": 0.7410380840301514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3408932089805603, + "epoch": 9.37, + "learning_rate": 3.1530008453085374e-06, + "loss": 0.3907, + "step": 11084, + "task_loss": 0.4041612148284912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5012742877006531, + "epoch": 9.37, + "learning_rate": 3.1487743026204567e-06, + "loss": 0.3645, + "step": 11085, + "task_loss": 0.6453322768211365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.12356162816286087, + "epoch": 9.37, + "learning_rate": 3.144547759932375e-06, + "loss": 0.4553, + "step": 11086, + "task_loss": 0.27337709069252014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29178446531295776, + "epoch": 9.37, + "learning_rate": 3.1403212172442945e-06, + "loss": 0.3669, + "step": 11087, + "task_loss": 0.2888859510421753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3878091871738434, + "epoch": 9.37, + "learning_rate": 3.1360946745562134e-06, + "loss": 0.3989, + "step": 11088, + "task_loss": 0.8689674139022827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24628372490406036, + "epoch": 9.37, + "learning_rate": 3.131868131868132e-06, + "loss": 0.3527, + "step": 11089, + "task_loss": 0.40652742981910706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30798661708831787, + "epoch": 9.37, + "learning_rate": 3.127641589180051e-06, + "loss": 0.4339, + "step": 11090, + "task_loss": 0.5469135046005249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33946582674980164, + "epoch": 9.38, + "learning_rate": 3.1234150464919696e-06, + "loss": 0.3979, + "step": 11091, + "task_loss": 0.595075786113739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28729313611984253, + "epoch": 9.38, + "learning_rate": 3.1191885038038885e-06, + "loss": 0.3503, + "step": 11092, + "task_loss": 0.24499759078025818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42708301544189453, + "epoch": 9.38, + "learning_rate": 3.1149619611158074e-06, + "loss": 0.4107, + "step": 11093, + "task_loss": 0.8493942022323608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5303544998168945, + "epoch": 9.38, + "learning_rate": 3.1107354184277263e-06, + "loss": 0.4103, + "step": 11094, + "task_loss": 1.5267333984375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3869002163410187, + "epoch": 9.38, + "learning_rate": 3.106508875739645e-06, + "loss": 0.3593, + "step": 11095, + "task_loss": 0.8789853453636169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24231404066085815, + "epoch": 9.38, + "learning_rate": 3.1022823330515637e-06, + "loss": 0.3452, + "step": 11096, + "task_loss": 0.5686526298522949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21870648860931396, + "epoch": 9.38, + "learning_rate": 3.0980557903634826e-06, + "loss": 0.2211, + "step": 11097, + "task_loss": 0.29066091775894165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23206254839897156, + "epoch": 9.38, + "learning_rate": 3.0938292476754014e-06, + "loss": 0.4378, + "step": 11098, + "task_loss": 0.5667880773544312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3592981696128845, + "epoch": 9.38, + "learning_rate": 3.0896027049873203e-06, + "loss": 0.4183, + "step": 11099, + "task_loss": 0.4001733660697937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28613173961639404, + "epoch": 9.38, + "learning_rate": 3.0853761622992392e-06, + "loss": 0.3511, + "step": 11100, + "task_loss": 0.4204499423503876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37144899368286133, + "epoch": 9.38, + "learning_rate": 3.081149619611158e-06, + "loss": 0.3554, + "step": 11101, + "task_loss": 1.710484266281128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3413456082344055, + "epoch": 9.38, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.3428, + "step": 11102, + "task_loss": 0.3558732569217682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3178199827671051, + "epoch": 9.39, + "learning_rate": 3.072696534234996e-06, + "loss": 0.4028, + "step": 11103, + "task_loss": 0.38671812415122986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28192704916000366, + "epoch": 9.39, + "learning_rate": 3.068469991546915e-06, + "loss": 0.3574, + "step": 11104, + "task_loss": 1.3155499696731567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3366948068141937, + "epoch": 9.39, + "learning_rate": 3.0642434488588337e-06, + "loss": 0.3821, + "step": 11105, + "task_loss": 0.26136136054992676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24905310571193695, + "epoch": 9.39, + "learning_rate": 3.0600169061707526e-06, + "loss": 0.3619, + "step": 11106, + "task_loss": 1.037542462348938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4351212978363037, + "epoch": 9.39, + "learning_rate": 3.0557903634826715e-06, + "loss": 0.4299, + "step": 11107, + "task_loss": 1.030940055847168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45062148571014404, + "epoch": 9.39, + "learning_rate": 3.0515638207945904e-06, + "loss": 0.4765, + "step": 11108, + "task_loss": 1.3921380043029785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29802122712135315, + "epoch": 9.39, + "learning_rate": 3.0473372781065093e-06, + "loss": 0.3424, + "step": 11109, + "task_loss": 0.6004763841629028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4276250898838043, + "epoch": 9.39, + "learning_rate": 3.043110735418428e-06, + "loss": 0.4188, + "step": 11110, + "task_loss": 0.5475895404815674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3173372745513916, + "epoch": 9.39, + "learning_rate": 3.0388841927303466e-06, + "loss": 0.3275, + "step": 11111, + "task_loss": 0.5360631942749023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4072100520133972, + "epoch": 9.39, + "learning_rate": 3.0346576500422655e-06, + "loss": 0.4034, + "step": 11112, + "task_loss": 1.122248888015747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23554542660713196, + "epoch": 9.39, + "learning_rate": 3.0304311073541844e-06, + "loss": 0.2879, + "step": 11113, + "task_loss": 0.1571430265903473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2392873615026474, + "epoch": 9.39, + "learning_rate": 3.0262045646661033e-06, + "loss": 0.343, + "step": 11114, + "task_loss": 0.7264078855514526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26673221588134766, + "epoch": 9.4, + "learning_rate": 3.021978021978022e-06, + "loss": 0.3701, + "step": 11115, + "task_loss": 0.04119856655597687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1630285382270813, + "epoch": 9.4, + "learning_rate": 3.017751479289941e-06, + "loss": 0.327, + "step": 11116, + "task_loss": 0.5227041244506836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26777705550193787, + "epoch": 9.4, + "learning_rate": 3.01352493660186e-06, + "loss": 0.3397, + "step": 11117, + "task_loss": 0.401448130607605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48817840218544006, + "epoch": 9.4, + "learning_rate": 3.009298393913779e-06, + "loss": 0.3959, + "step": 11118, + "task_loss": 0.2747510075569153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37120527029037476, + "epoch": 9.4, + "learning_rate": 3.0050718512256973e-06, + "loss": 0.4289, + "step": 11119, + "task_loss": 0.373191237449646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18152177333831787, + "epoch": 9.4, + "learning_rate": 3.0008453085376162e-06, + "loss": 0.3544, + "step": 11120, + "task_loss": 0.5906203985214233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3422137498855591, + "epoch": 9.4, + "learning_rate": 2.996618765849535e-06, + "loss": 0.3601, + "step": 11121, + "task_loss": 0.5210201144218445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3814418315887451, + "epoch": 9.4, + "learning_rate": 2.992392223161454e-06, + "loss": 0.4982, + "step": 11122, + "task_loss": 0.36218300461769104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3380776345729828, + "epoch": 9.4, + "learning_rate": 2.988165680473373e-06, + "loss": 0.3983, + "step": 11123, + "task_loss": 0.45964542031288147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22319814562797546, + "epoch": 9.4, + "learning_rate": 2.9839391377852918e-06, + "loss": 0.3362, + "step": 11124, + "task_loss": 0.34649941325187683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5251970291137695, + "epoch": 9.4, + "learning_rate": 2.9797125950972107e-06, + "loss": 0.4764, + "step": 11125, + "task_loss": 0.35213711857795715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36574220657348633, + "epoch": 9.4, + "learning_rate": 2.9754860524091296e-06, + "loss": 0.409, + "step": 11126, + "task_loss": 0.9604750275611877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49621087312698364, + "epoch": 9.41, + "learning_rate": 2.971259509721048e-06, + "loss": 0.3818, + "step": 11127, + "task_loss": 0.5045230388641357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4557150900363922, + "epoch": 9.41, + "learning_rate": 2.967032967032967e-06, + "loss": 0.3404, + "step": 11128, + "task_loss": 0.1360621154308319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.12398593872785568, + "epoch": 9.41, + "learning_rate": 2.962806424344886e-06, + "loss": 0.273, + "step": 11129, + "task_loss": 0.007481226231902838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2632799744606018, + "epoch": 9.41, + "learning_rate": 2.9585798816568047e-06, + "loss": 0.3894, + "step": 11130, + "task_loss": 0.4044606685638428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45583659410476685, + "epoch": 9.41, + "learning_rate": 2.9543533389687236e-06, + "loss": 0.3836, + "step": 11131, + "task_loss": 1.7014849185943604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2961050271987915, + "epoch": 9.41, + "learning_rate": 2.950126796280643e-06, + "loss": 0.3258, + "step": 11132, + "task_loss": 1.3750492334365845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23177145421504974, + "epoch": 9.41, + "learning_rate": 2.9459002535925614e-06, + "loss": 0.2053, + "step": 11133, + "task_loss": 0.27053189277648926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4508607089519501, + "epoch": 9.41, + "learning_rate": 2.9416737109044803e-06, + "loss": 0.3504, + "step": 11134, + "task_loss": 1.0556132793426514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3256028890609741, + "epoch": 9.41, + "learning_rate": 2.937447168216399e-06, + "loss": 0.3676, + "step": 11135, + "task_loss": 0.6541071534156799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3309953212738037, + "epoch": 9.41, + "learning_rate": 2.933220625528318e-06, + "loss": 0.3615, + "step": 11136, + "task_loss": 0.6462727189064026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20459747314453125, + "epoch": 9.41, + "learning_rate": 2.928994082840237e-06, + "loss": 0.2526, + "step": 11137, + "task_loss": 0.4076334834098816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1990799754858017, + "epoch": 9.41, + "learning_rate": 2.924767540152156e-06, + "loss": 0.312, + "step": 11138, + "task_loss": 0.3357452154159546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2570980191230774, + "epoch": 9.42, + "learning_rate": 2.9205409974640747e-06, + "loss": 0.4303, + "step": 11139, + "task_loss": 0.7876008152961731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3088034987449646, + "epoch": 9.42, + "learning_rate": 2.9163144547759936e-06, + "loss": 0.3194, + "step": 11140, + "task_loss": 0.5289038419723511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35589104890823364, + "epoch": 9.42, + "learning_rate": 2.912087912087912e-06, + "loss": 0.4739, + "step": 11141, + "task_loss": 0.40654855966567993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2923547327518463, + "epoch": 9.42, + "learning_rate": 2.907861369399831e-06, + "loss": 0.3831, + "step": 11142, + "task_loss": 0.27605077624320984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43648242950439453, + "epoch": 9.42, + "learning_rate": 2.90363482671175e-06, + "loss": 0.3481, + "step": 11143, + "task_loss": 0.24743427336215973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25060001015663147, + "epoch": 9.42, + "learning_rate": 2.8994082840236688e-06, + "loss": 0.4096, + "step": 11144, + "task_loss": 0.6989130973815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37293773889541626, + "epoch": 9.42, + "learning_rate": 2.8951817413355877e-06, + "loss": 0.3446, + "step": 11145, + "task_loss": 0.7791764140129089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4002939462661743, + "epoch": 9.42, + "learning_rate": 2.8909551986475065e-06, + "loss": 0.3837, + "step": 11146, + "task_loss": 1.1386535167694092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3432043194770813, + "epoch": 9.42, + "learning_rate": 2.8867286559594254e-06, + "loss": 0.31, + "step": 11147, + "task_loss": 0.3063526451587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31005948781967163, + "epoch": 9.42, + "learning_rate": 2.8825021132713443e-06, + "loss": 0.369, + "step": 11148, + "task_loss": 0.18603965640068054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38781529664993286, + "epoch": 9.42, + "learning_rate": 2.878275570583263e-06, + "loss": 0.3884, + "step": 11149, + "task_loss": 0.8698360323905945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49873051047325134, + "epoch": 9.42, + "learning_rate": 2.8740490278951817e-06, + "loss": 0.3806, + "step": 11150, + "task_loss": 0.5676694512367249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23108352720737457, + "epoch": 9.43, + "learning_rate": 2.8698224852071006e-06, + "loss": 0.3083, + "step": 11151, + "task_loss": 0.49526727199554443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3350542187690735, + "epoch": 9.43, + "learning_rate": 2.8655959425190195e-06, + "loss": 0.3731, + "step": 11152, + "task_loss": 0.6679126620292664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3656049072742462, + "epoch": 9.43, + "learning_rate": 2.8613693998309384e-06, + "loss": 0.3294, + "step": 11153, + "task_loss": 0.2707204222679138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2533494234085083, + "epoch": 9.43, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2786, + "step": 11154, + "task_loss": 0.8871655464172363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3235633969306946, + "epoch": 9.43, + "learning_rate": 2.852916314454776e-06, + "loss": 0.36, + "step": 11155, + "task_loss": 0.14663556218147278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3071832060813904, + "epoch": 9.43, + "learning_rate": 2.848689771766695e-06, + "loss": 0.2851, + "step": 11156, + "task_loss": 0.5383762717247009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28878065943717957, + "epoch": 9.43, + "learning_rate": 2.8444632290786135e-06, + "loss": 0.3209, + "step": 11157, + "task_loss": 0.7168927192687988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4282752275466919, + "epoch": 9.43, + "learning_rate": 2.8402366863905324e-06, + "loss": 0.444, + "step": 11158, + "task_loss": 0.6458749175071716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3900154232978821, + "epoch": 9.43, + "learning_rate": 2.8360101437024513e-06, + "loss": 0.4018, + "step": 11159, + "task_loss": 1.3545957803726196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1885564923286438, + "epoch": 9.43, + "learning_rate": 2.83178360101437e-06, + "loss": 0.2824, + "step": 11160, + "task_loss": 0.17722372710704803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.191127747297287, + "epoch": 9.43, + "learning_rate": 2.827557058326289e-06, + "loss": 0.3312, + "step": 11161, + "task_loss": 0.5437976717948914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2990482449531555, + "epoch": 9.44, + "learning_rate": 2.8233305156382084e-06, + "loss": 0.3346, + "step": 11162, + "task_loss": 0.7536722421646118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35889384150505066, + "epoch": 9.44, + "learning_rate": 2.8191039729501273e-06, + "loss": 0.3899, + "step": 11163, + "task_loss": 0.6088002324104309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29134809970855713, + "epoch": 9.44, + "learning_rate": 2.8148774302620457e-06, + "loss": 0.3064, + "step": 11164, + "task_loss": 1.0273517370224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13257403671741486, + "epoch": 9.44, + "learning_rate": 2.8106508875739646e-06, + "loss": 0.3117, + "step": 11165, + "task_loss": 0.11883401870727539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.406500905752182, + "epoch": 9.44, + "learning_rate": 2.8064243448858835e-06, + "loss": 0.3693, + "step": 11166, + "task_loss": 0.29360702633857727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2889021635055542, + "epoch": 9.44, + "learning_rate": 2.8021978021978024e-06, + "loss": 0.275, + "step": 11167, + "task_loss": 0.1887950599193573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3043598532676697, + "epoch": 9.44, + "learning_rate": 2.7979712595097213e-06, + "loss": 0.3032, + "step": 11168, + "task_loss": 0.999653697013855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3810463547706604, + "epoch": 9.44, + "learning_rate": 2.79374471682164e-06, + "loss": 0.3273, + "step": 11169, + "task_loss": 0.7576808929443359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3075995147228241, + "epoch": 9.44, + "learning_rate": 2.789518174133559e-06, + "loss": 0.505, + "step": 11170, + "task_loss": 0.9707009792327881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2421969771385193, + "epoch": 9.44, + "learning_rate": 2.785291631445478e-06, + "loss": 0.2542, + "step": 11171, + "task_loss": 0.5644077658653259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2443886548280716, + "epoch": 9.44, + "learning_rate": 2.7810650887573965e-06, + "loss": 0.3005, + "step": 11172, + "task_loss": 0.7673459649085999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3950432538986206, + "epoch": 9.44, + "learning_rate": 2.7768385460693153e-06, + "loss": 0.3732, + "step": 11173, + "task_loss": 0.19618147611618042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19636203348636627, + "epoch": 9.45, + "learning_rate": 2.7726120033812342e-06, + "loss": 0.3113, + "step": 11174, + "task_loss": 0.21818137168884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2605751156806946, + "epoch": 9.45, + "learning_rate": 2.768385460693153e-06, + "loss": 0.3423, + "step": 11175, + "task_loss": 0.8324499726295471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18753059208393097, + "epoch": 9.45, + "learning_rate": 2.764158918005072e-06, + "loss": 0.3201, + "step": 11176, + "task_loss": 0.3602069616317749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5155521035194397, + "epoch": 9.45, + "learning_rate": 2.759932375316991e-06, + "loss": 0.3462, + "step": 11177, + "task_loss": 0.4848915636539459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28499725461006165, + "epoch": 9.45, + "learning_rate": 2.75570583262891e-06, + "loss": 0.3213, + "step": 11178, + "task_loss": 0.24565193057060242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40402278304100037, + "epoch": 9.45, + "learning_rate": 2.7514792899408287e-06, + "loss": 0.3392, + "step": 11179, + "task_loss": 0.8326399922370911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46275094151496887, + "epoch": 9.45, + "learning_rate": 2.747252747252747e-06, + "loss": 0.4173, + "step": 11180, + "task_loss": 0.36680543422698975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24392396211624146, + "epoch": 9.45, + "learning_rate": 2.743026204564666e-06, + "loss": 0.2465, + "step": 11181, + "task_loss": 0.30823227763175964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44473177194595337, + "epoch": 9.45, + "learning_rate": 2.738799661876585e-06, + "loss": 0.4163, + "step": 11182, + "task_loss": 0.3216438591480255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2071896344423294, + "epoch": 9.45, + "learning_rate": 2.734573119188504e-06, + "loss": 0.2682, + "step": 11183, + "task_loss": 0.20310378074645996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3409462571144104, + "epoch": 9.45, + "learning_rate": 2.7303465765004227e-06, + "loss": 0.3783, + "step": 11184, + "task_loss": 0.08326666057109833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3120713233947754, + "epoch": 9.45, + "learning_rate": 2.7261200338123416e-06, + "loss": 0.4099, + "step": 11185, + "task_loss": 0.7058430910110474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13359495997428894, + "epoch": 9.46, + "learning_rate": 2.7218934911242605e-06, + "loss": 0.2778, + "step": 11186, + "task_loss": 0.027225244790315628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41171905398368835, + "epoch": 9.46, + "learning_rate": 2.7176669484361794e-06, + "loss": 0.3132, + "step": 11187, + "task_loss": 0.622369110584259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3047202229499817, + "epoch": 9.46, + "learning_rate": 2.713440405748098e-06, + "loss": 0.3383, + "step": 11188, + "task_loss": 0.9664789438247681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21926406025886536, + "epoch": 9.46, + "learning_rate": 2.7092138630600168e-06, + "loss": 0.2802, + "step": 11189, + "task_loss": 1.006090521812439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3817671835422516, + "epoch": 9.46, + "learning_rate": 2.7049873203719357e-06, + "loss": 0.3014, + "step": 11190, + "task_loss": 1.0172269344329834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4449467062950134, + "epoch": 9.46, + "learning_rate": 2.700760777683855e-06, + "loss": 0.3543, + "step": 11191, + "task_loss": 0.3825158178806305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4292738735675812, + "epoch": 9.46, + "learning_rate": 2.696534234995774e-06, + "loss": 0.4496, + "step": 11192, + "task_loss": 0.8003434538841248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24324774742126465, + "epoch": 9.46, + "learning_rate": 2.6923076923076928e-06, + "loss": 0.3683, + "step": 11193, + "task_loss": 0.27464956045150757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.394970566034317, + "epoch": 9.46, + "learning_rate": 2.6880811496196112e-06, + "loss": 0.328, + "step": 11194, + "task_loss": 1.0642701387405396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4030565917491913, + "epoch": 9.46, + "learning_rate": 2.68385460693153e-06, + "loss": 0.2957, + "step": 11195, + "task_loss": 1.3650078773498535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17747089266777039, + "epoch": 9.46, + "learning_rate": 2.679628064243449e-06, + "loss": 0.3279, + "step": 11196, + "task_loss": 0.3820453882217407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4664587080478668, + "epoch": 9.46, + "learning_rate": 2.675401521555368e-06, + "loss": 0.3248, + "step": 11197, + "task_loss": 0.48224613070487976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2361564338207245, + "epoch": 9.47, + "learning_rate": 2.671174978867287e-06, + "loss": 0.2622, + "step": 11198, + "task_loss": 0.25270190834999084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3353045582771301, + "epoch": 9.47, + "learning_rate": 2.6669484361792057e-06, + "loss": 0.318, + "step": 11199, + "task_loss": 0.4183235764503479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.262408584356308, + "epoch": 9.47, + "learning_rate": 2.6627218934911246e-06, + "loss": 0.3502, + "step": 11200, + "task_loss": 0.6557337045669556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35873621702194214, + "epoch": 9.47, + "learning_rate": 2.6584953508030435e-06, + "loss": 0.3308, + "step": 11201, + "task_loss": 0.6011322736740112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34558629989624023, + "epoch": 9.47, + "learning_rate": 2.654268808114962e-06, + "loss": 0.3915, + "step": 11202, + "task_loss": 0.04240553081035614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4079582989215851, + "epoch": 9.47, + "learning_rate": 2.650042265426881e-06, + "loss": 0.3423, + "step": 11203, + "task_loss": 0.8091160655021667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3989418148994446, + "epoch": 9.47, + "learning_rate": 2.6458157227387997e-06, + "loss": 0.3648, + "step": 11204, + "task_loss": 1.2211722135543823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22336331009864807, + "epoch": 9.47, + "learning_rate": 2.6415891800507186e-06, + "loss": 0.3255, + "step": 11205, + "task_loss": 0.15671966969966888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22267872095108032, + "epoch": 9.47, + "learning_rate": 2.6373626373626375e-06, + "loss": 0.2524, + "step": 11206, + "task_loss": 0.28931641578674316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2927022874355316, + "epoch": 9.47, + "learning_rate": 2.6331360946745564e-06, + "loss": 0.3034, + "step": 11207, + "task_loss": 0.9644102454185486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4131059944629669, + "epoch": 9.47, + "learning_rate": 2.6289095519864753e-06, + "loss": 0.3307, + "step": 11208, + "task_loss": 0.6778172254562378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3743073046207428, + "epoch": 9.47, + "learning_rate": 2.624683009298394e-06, + "loss": 0.3455, + "step": 11209, + "task_loss": 0.5322973728179932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3584333658218384, + "epoch": 9.48, + "learning_rate": 2.6204564666103126e-06, + "loss": 0.3204, + "step": 11210, + "task_loss": 1.0458983182907104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20761582255363464, + "epoch": 9.48, + "learning_rate": 2.6162299239222315e-06, + "loss": 0.3786, + "step": 11211, + "task_loss": 0.045559678226709366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36495184898376465, + "epoch": 9.48, + "learning_rate": 2.6120033812341504e-06, + "loss": 0.3919, + "step": 11212, + "task_loss": 0.5553227663040161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5865755677223206, + "epoch": 9.48, + "learning_rate": 2.6077768385460693e-06, + "loss": 0.4001, + "step": 11213, + "task_loss": 0.8763982057571411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48601412773132324, + "epoch": 9.48, + "learning_rate": 2.603550295857988e-06, + "loss": 0.4276, + "step": 11214, + "task_loss": 0.4793255627155304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.203613743185997, + "epoch": 9.48, + "learning_rate": 2.599323753169907e-06, + "loss": 0.3254, + "step": 11215, + "task_loss": 0.2487459033727646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49443572759628296, + "epoch": 9.48, + "learning_rate": 2.595097210481826e-06, + "loss": 0.4331, + "step": 11216, + "task_loss": 1.4443079233169556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42447954416275024, + "epoch": 9.48, + "learning_rate": 2.590870667793745e-06, + "loss": 0.3893, + "step": 11217, + "task_loss": 0.07932371646165848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3816227316856384, + "epoch": 9.48, + "learning_rate": 2.5866441251056634e-06, + "loss": 0.3588, + "step": 11218, + "task_loss": 0.41712573170661926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.586235523223877, + "epoch": 9.48, + "learning_rate": 2.5824175824175822e-06, + "loss": 0.4347, + "step": 11219, + "task_loss": 0.8165228962898254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5157811641693115, + "epoch": 9.48, + "learning_rate": 2.578191039729501e-06, + "loss": 0.3712, + "step": 11220, + "task_loss": 0.5261442065238953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6986539363861084, + "epoch": 9.48, + "learning_rate": 2.5739644970414204e-06, + "loss": 0.5047, + "step": 11221, + "task_loss": 1.1466091871261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23622068762779236, + "epoch": 9.49, + "learning_rate": 2.5697379543533393e-06, + "loss": 0.3423, + "step": 11222, + "task_loss": 0.3911498188972473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21634319424629211, + "epoch": 9.49, + "learning_rate": 2.5655114116652582e-06, + "loss": 0.3416, + "step": 11223, + "task_loss": 0.07129547744989395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4544490575790405, + "epoch": 9.49, + "learning_rate": 2.561284868977177e-06, + "loss": 0.4354, + "step": 11224, + "task_loss": 1.3349862098693848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22212593257427216, + "epoch": 9.49, + "learning_rate": 2.5570583262890956e-06, + "loss": 0.3817, + "step": 11225, + "task_loss": 0.2652691602706909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37907928228378296, + "epoch": 9.49, + "learning_rate": 2.5528317836010145e-06, + "loss": 0.3415, + "step": 11226, + "task_loss": 0.8815335035324097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39877671003341675, + "epoch": 9.49, + "learning_rate": 2.5486052409129334e-06, + "loss": 0.3113, + "step": 11227, + "task_loss": 0.6111629009246826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3368489146232605, + "epoch": 9.49, + "learning_rate": 2.5443786982248523e-06, + "loss": 0.2438, + "step": 11228, + "task_loss": 0.23493146896362305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21482357382774353, + "epoch": 9.49, + "learning_rate": 2.540152155536771e-06, + "loss": 0.3796, + "step": 11229, + "task_loss": 0.4439648389816284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2962871193885803, + "epoch": 9.49, + "learning_rate": 2.53592561284869e-06, + "loss": 0.4066, + "step": 11230, + "task_loss": 0.3677087426185608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23566138744354248, + "epoch": 9.49, + "learning_rate": 2.531699070160609e-06, + "loss": 0.3729, + "step": 11231, + "task_loss": 0.14846713840961456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3625061511993408, + "epoch": 9.49, + "learning_rate": 2.527472527472528e-06, + "loss": 0.4053, + "step": 11232, + "task_loss": 0.7605141997337341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47550538182258606, + "epoch": 9.5, + "learning_rate": 2.5232459847844463e-06, + "loss": 0.3482, + "step": 11233, + "task_loss": 0.5712429881095886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5889968872070312, + "epoch": 9.5, + "learning_rate": 2.519019442096365e-06, + "loss": 0.4103, + "step": 11234, + "task_loss": 0.6373184323310852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.306999146938324, + "epoch": 9.5, + "learning_rate": 2.514792899408284e-06, + "loss": 0.2974, + "step": 11235, + "task_loss": 1.1315233707427979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47511088848114014, + "epoch": 9.5, + "learning_rate": 2.510566356720203e-06, + "loss": 0.3017, + "step": 11236, + "task_loss": 0.5258234739303589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23093372583389282, + "epoch": 9.5, + "learning_rate": 2.506339814032122e-06, + "loss": 0.2415, + "step": 11237, + "task_loss": 0.1555791199207306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49257826805114746, + "epoch": 9.5, + "learning_rate": 2.5021132713440408e-06, + "loss": 0.2845, + "step": 11238, + "task_loss": 0.9504099488258362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2972797453403473, + "epoch": 9.5, + "learning_rate": 2.4978867286559597e-06, + "loss": 0.2845, + "step": 11239, + "task_loss": 0.7512521743774414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40362173318862915, + "epoch": 9.5, + "learning_rate": 2.4936601859678785e-06, + "loss": 0.4071, + "step": 11240, + "task_loss": 0.7546878457069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.645882248878479, + "epoch": 9.5, + "learning_rate": 2.489433643279797e-06, + "loss": 0.3855, + "step": 11241, + "task_loss": 0.7352324724197388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26476460695266724, + "epoch": 9.5, + "learning_rate": 2.485207100591716e-06, + "loss": 0.3099, + "step": 11242, + "task_loss": 0.38644272089004517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48864278197288513, + "epoch": 9.5, + "learning_rate": 2.480980557903635e-06, + "loss": 0.3817, + "step": 11243, + "task_loss": 1.1041029691696167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3022564947605133, + "epoch": 9.5, + "learning_rate": 2.4767540152155537e-06, + "loss": 0.2858, + "step": 11244, + "task_loss": 0.3329137861728668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24785397946834564, + "epoch": 9.51, + "learning_rate": 2.4725274725274726e-06, + "loss": 0.2989, + "step": 11245, + "task_loss": 0.32285284996032715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32519960403442383, + "epoch": 9.51, + "learning_rate": 2.4683009298393915e-06, + "loss": 0.2943, + "step": 11246, + "task_loss": 0.24122153222560883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2732008695602417, + "epoch": 9.51, + "learning_rate": 2.4640743871513104e-06, + "loss": 0.3719, + "step": 11247, + "task_loss": 0.7145626544952393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6260085105895996, + "epoch": 9.51, + "learning_rate": 2.4598478444632293e-06, + "loss": 0.3501, + "step": 11248, + "task_loss": 0.9510791301727295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27141547203063965, + "epoch": 9.51, + "learning_rate": 2.4556213017751477e-06, + "loss": 0.3061, + "step": 11249, + "task_loss": 0.13078004121780396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48791682720184326, + "epoch": 9.51, + "learning_rate": 2.4513947590870666e-06, + "loss": 0.3337, + "step": 11250, + "task_loss": 0.456876277923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3203643262386322, + "epoch": 9.51, + "learning_rate": 2.447168216398986e-06, + "loss": 0.4589, + "step": 11251, + "task_loss": 0.9103769659996033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37787556648254395, + "epoch": 9.51, + "learning_rate": 2.442941673710905e-06, + "loss": 0.3822, + "step": 11252, + "task_loss": 0.44761815667152405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34854191541671753, + "epoch": 9.51, + "learning_rate": 2.4387151310228237e-06, + "loss": 0.3136, + "step": 11253, + "task_loss": 0.5957052707672119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26073092222213745, + "epoch": 9.51, + "learning_rate": 2.4344885883347426e-06, + "loss": 0.3113, + "step": 11254, + "task_loss": 0.5697309374809265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1848655641078949, + "epoch": 9.51, + "learning_rate": 2.430262045646661e-06, + "loss": 0.3014, + "step": 11255, + "task_loss": 0.7200050950050354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17946857213974, + "epoch": 9.51, + "learning_rate": 2.42603550295858e-06, + "loss": 0.3607, + "step": 11256, + "task_loss": 0.3222772479057312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.247155100107193, + "epoch": 9.52, + "learning_rate": 2.421808960270499e-06, + "loss": 0.3102, + "step": 11257, + "task_loss": 0.11182907968759537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3045231103897095, + "epoch": 9.52, + "learning_rate": 2.4175824175824177e-06, + "loss": 0.4454, + "step": 11258, + "task_loss": 0.39615824818611145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3244926929473877, + "epoch": 9.52, + "learning_rate": 2.4133558748943366e-06, + "loss": 0.3419, + "step": 11259, + "task_loss": 1.0087002515792847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1687776744365692, + "epoch": 9.52, + "learning_rate": 2.4091293322062555e-06, + "loss": 0.2594, + "step": 11260, + "task_loss": 0.15563015639781952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30812138319015503, + "epoch": 9.52, + "learning_rate": 2.4049027895181744e-06, + "loss": 0.3729, + "step": 11261, + "task_loss": 0.5975702404975891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3309425711631775, + "epoch": 9.52, + "learning_rate": 2.4006762468300933e-06, + "loss": 0.3504, + "step": 11262, + "task_loss": 0.28101032972335815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2739301323890686, + "epoch": 9.52, + "learning_rate": 2.3964497041420118e-06, + "loss": 0.3313, + "step": 11263, + "task_loss": 0.7410649061203003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2618538737297058, + "epoch": 9.52, + "learning_rate": 2.3922231614539307e-06, + "loss": 0.3267, + "step": 11264, + "task_loss": 0.4619693160057068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30512380599975586, + "epoch": 9.52, + "learning_rate": 2.3879966187658496e-06, + "loss": 0.3803, + "step": 11265, + "task_loss": 0.4550963044166565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6370394825935364, + "epoch": 9.52, + "learning_rate": 2.3837700760777685e-06, + "loss": 0.4296, + "step": 11266, + "task_loss": 1.361843466758728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3523854613304138, + "epoch": 9.52, + "learning_rate": 2.3795435333896873e-06, + "loss": 0.2812, + "step": 11267, + "task_loss": 0.23397944867610931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34427791833877563, + "epoch": 9.52, + "learning_rate": 2.3753169907016062e-06, + "loss": 0.2746, + "step": 11268, + "task_loss": 0.3872188329696655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27786725759506226, + "epoch": 9.53, + "learning_rate": 2.371090448013525e-06, + "loss": 0.2863, + "step": 11269, + "task_loss": 0.4174913763999939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29992711544036865, + "epoch": 9.53, + "learning_rate": 2.366863905325444e-06, + "loss": 0.2748, + "step": 11270, + "task_loss": 0.2696077525615692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30999353528022766, + "epoch": 9.53, + "learning_rate": 2.3626373626373625e-06, + "loss": 0.3082, + "step": 11271, + "task_loss": 0.9265884160995483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2943624258041382, + "epoch": 9.53, + "learning_rate": 2.3584108199492814e-06, + "loss": 0.2983, + "step": 11272, + "task_loss": 0.631165087223053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3417455852031708, + "epoch": 9.53, + "learning_rate": 2.3541842772612003e-06, + "loss": 0.2506, + "step": 11273, + "task_loss": 0.9288751482963562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39671215415000916, + "epoch": 9.53, + "learning_rate": 2.349957734573119e-06, + "loss": 0.3405, + "step": 11274, + "task_loss": 0.7747163772583008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3482685387134552, + "epoch": 9.53, + "learning_rate": 2.345731191885038e-06, + "loss": 0.5447, + "step": 11275, + "task_loss": 0.7401619553565979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.435600608587265, + "epoch": 9.53, + "learning_rate": 2.341504649196957e-06, + "loss": 0.356, + "step": 11276, + "task_loss": 0.16817057132720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2593138813972473, + "epoch": 9.53, + "learning_rate": 2.337278106508876e-06, + "loss": 0.3154, + "step": 11277, + "task_loss": 0.4653940796852112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3357090353965759, + "epoch": 9.53, + "learning_rate": 2.3330515638207947e-06, + "loss": 0.3413, + "step": 11278, + "task_loss": 1.2139111757278442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.257800817489624, + "epoch": 9.53, + "learning_rate": 2.328825021132713e-06, + "loss": 0.2732, + "step": 11279, + "task_loss": 0.5855380892753601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31374263763427734, + "epoch": 9.53, + "learning_rate": 2.324598478444632e-06, + "loss": 0.3099, + "step": 11280, + "task_loss": 0.155843123793602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4167914390563965, + "epoch": 9.54, + "learning_rate": 2.3203719357565514e-06, + "loss": 0.3793, + "step": 11281, + "task_loss": 1.9477704763412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4643925428390503, + "epoch": 9.54, + "learning_rate": 2.3161453930684703e-06, + "loss": 0.4343, + "step": 11282, + "task_loss": 0.116689532995224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3383294939994812, + "epoch": 9.54, + "learning_rate": 2.311918850380389e-06, + "loss": 0.364, + "step": 11283, + "task_loss": 1.1284469366073608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22616912424564362, + "epoch": 9.54, + "learning_rate": 2.307692307692308e-06, + "loss": 0.3293, + "step": 11284, + "task_loss": 0.3918701410293579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4249834716320038, + "epoch": 9.54, + "learning_rate": 2.303465765004227e-06, + "loss": 0.4101, + "step": 11285, + "task_loss": 1.1767102479934692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.334256649017334, + "epoch": 9.54, + "learning_rate": 2.2992392223161454e-06, + "loss": 0.421, + "step": 11286, + "task_loss": 0.5789220929145813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3614867925643921, + "epoch": 9.54, + "learning_rate": 2.2950126796280643e-06, + "loss": 0.4794, + "step": 11287, + "task_loss": 0.5564362406730652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6991974711418152, + "epoch": 9.54, + "learning_rate": 2.2907861369399832e-06, + "loss": 0.4562, + "step": 11288, + "task_loss": 1.6002708673477173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3912792503833771, + "epoch": 9.54, + "learning_rate": 2.286559594251902e-06, + "loss": 0.4135, + "step": 11289, + "task_loss": 0.6200382113456726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3674837350845337, + "epoch": 9.54, + "learning_rate": 2.282333051563821e-06, + "loss": 0.3653, + "step": 11290, + "task_loss": 0.25311601161956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21538691222667694, + "epoch": 9.54, + "learning_rate": 2.27810650887574e-06, + "loss": 0.3212, + "step": 11291, + "task_loss": 0.2719196677207947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3568977117538452, + "epoch": 9.54, + "learning_rate": 2.2738799661876588e-06, + "loss": 0.394, + "step": 11292, + "task_loss": 0.46265581250190735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3504661023616791, + "epoch": 9.55, + "learning_rate": 2.2696534234995777e-06, + "loss": 0.2946, + "step": 11293, + "task_loss": 0.7050178647041321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22982819378376007, + "epoch": 9.55, + "learning_rate": 2.265426880811496e-06, + "loss": 0.3634, + "step": 11294, + "task_loss": 0.3301703631877899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35973668098449707, + "epoch": 9.55, + "learning_rate": 2.261200338123415e-06, + "loss": 0.3474, + "step": 11295, + "task_loss": 0.8677868247032166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36139997839927673, + "epoch": 9.55, + "learning_rate": 2.256973795435334e-06, + "loss": 0.379, + "step": 11296, + "task_loss": 0.9272249937057495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45224395394325256, + "epoch": 9.55, + "learning_rate": 2.252747252747253e-06, + "loss": 0.3255, + "step": 11297, + "task_loss": 0.23578180372714996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.261436402797699, + "epoch": 9.55, + "learning_rate": 2.2485207100591717e-06, + "loss": 0.4009, + "step": 11298, + "task_loss": 1.0435104370117188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16719746589660645, + "epoch": 9.55, + "learning_rate": 2.2442941673710906e-06, + "loss": 0.281, + "step": 11299, + "task_loss": 0.2571280002593994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46074315905570984, + "epoch": 9.55, + "learning_rate": 2.2400676246830095e-06, + "loss": 0.3586, + "step": 11300, + "task_loss": 0.6485046744346619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29971182346343994, + "epoch": 9.55, + "learning_rate": 2.2358410819949284e-06, + "loss": 0.3926, + "step": 11301, + "task_loss": 0.3752540647983551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2050643414258957, + "epoch": 9.55, + "learning_rate": 2.231614539306847e-06, + "loss": 0.3566, + "step": 11302, + "task_loss": 0.6496607661247253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37660926580429077, + "epoch": 9.55, + "learning_rate": 2.2273879966187657e-06, + "loss": 0.4935, + "step": 11303, + "task_loss": 0.7593499422073364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39057713747024536, + "epoch": 9.56, + "learning_rate": 2.2231614539306846e-06, + "loss": 0.454, + "step": 11304, + "task_loss": 0.7025313377380371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2646704614162445, + "epoch": 9.56, + "learning_rate": 2.2189349112426035e-06, + "loss": 0.3395, + "step": 11305, + "task_loss": 0.5825679302215576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21712934970855713, + "epoch": 9.56, + "learning_rate": 2.2147083685545224e-06, + "loss": 0.3976, + "step": 11306, + "task_loss": 0.44474098086357117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6768749952316284, + "epoch": 9.56, + "learning_rate": 2.2104818258664413e-06, + "loss": 0.4045, + "step": 11307, + "task_loss": 0.489572674036026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3789549767971039, + "epoch": 9.56, + "learning_rate": 2.20625528317836e-06, + "loss": 0.3003, + "step": 11308, + "task_loss": 0.43537437915802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5831117630004883, + "epoch": 9.56, + "learning_rate": 2.202028740490279e-06, + "loss": 0.4524, + "step": 11309, + "task_loss": 1.1619058847427368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33674752712249756, + "epoch": 9.56, + "learning_rate": 2.197802197802198e-06, + "loss": 0.335, + "step": 11310, + "task_loss": 0.5147825479507446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4123269021511078, + "epoch": 9.56, + "learning_rate": 2.193575655114117e-06, + "loss": 0.4734, + "step": 11311, + "task_loss": 0.5781444907188416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.345676988363266, + "epoch": 9.56, + "learning_rate": 2.1893491124260358e-06, + "loss": 0.3586, + "step": 11312, + "task_loss": 0.6409375071525574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2583978772163391, + "epoch": 9.56, + "learning_rate": 2.1851225697379547e-06, + "loss": 0.3008, + "step": 11313, + "task_loss": 0.7200506925582886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28517550230026245, + "epoch": 9.56, + "learning_rate": 2.1808960270498736e-06, + "loss": 0.3608, + "step": 11314, + "task_loss": 1.1121327877044678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5476095676422119, + "epoch": 9.56, + "learning_rate": 2.1766694843617924e-06, + "loss": 0.519, + "step": 11315, + "task_loss": 0.5626525282859802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2948870062828064, + "epoch": 9.57, + "learning_rate": 2.172442941673711e-06, + "loss": 0.2944, + "step": 11316, + "task_loss": 0.10477277636528015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3104032874107361, + "epoch": 9.57, + "learning_rate": 2.16821639898563e-06, + "loss": 0.2905, + "step": 11317, + "task_loss": 0.12572573125362396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30485475063323975, + "epoch": 9.57, + "learning_rate": 2.1639898562975487e-06, + "loss": 0.2992, + "step": 11318, + "task_loss": 0.7191717624664307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48069918155670166, + "epoch": 9.57, + "learning_rate": 2.1597633136094676e-06, + "loss": 0.3559, + "step": 11319, + "task_loss": 1.0460033416748047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24722042679786682, + "epoch": 9.57, + "learning_rate": 2.1555367709213865e-06, + "loss": 0.3397, + "step": 11320, + "task_loss": 0.5406808853149414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42122042179107666, + "epoch": 9.57, + "learning_rate": 2.1513102282333054e-06, + "loss": 0.4235, + "step": 11321, + "task_loss": 0.492661714553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4442176818847656, + "epoch": 9.57, + "learning_rate": 2.1470836855452243e-06, + "loss": 0.3109, + "step": 11322, + "task_loss": 0.5840321183204651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39093416929244995, + "epoch": 9.57, + "learning_rate": 2.142857142857143e-06, + "loss": 0.3028, + "step": 11323, + "task_loss": 0.42109596729278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3499734699726105, + "epoch": 9.57, + "learning_rate": 2.1386306001690616e-06, + "loss": 0.4729, + "step": 11324, + "task_loss": 0.11740142107009888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3950878381729126, + "epoch": 9.57, + "learning_rate": 2.1344040574809805e-06, + "loss": 0.2758, + "step": 11325, + "task_loss": 0.5653151273727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46087199449539185, + "epoch": 9.57, + "learning_rate": 2.1301775147928994e-06, + "loss": 0.4125, + "step": 11326, + "task_loss": 0.5265005826950073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3211498558521271, + "epoch": 9.57, + "learning_rate": 2.1259509721048183e-06, + "loss": 0.3471, + "step": 11327, + "task_loss": 0.18026238679885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37499815225601196, + "epoch": 9.58, + "learning_rate": 2.121724429416737e-06, + "loss": 0.4197, + "step": 11328, + "task_loss": 0.6027863621711731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27312034368515015, + "epoch": 9.58, + "learning_rate": 2.117497886728656e-06, + "loss": 0.2502, + "step": 11329, + "task_loss": 0.3918531537055969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3683964014053345, + "epoch": 9.58, + "learning_rate": 2.113271344040575e-06, + "loss": 0.3688, + "step": 11330, + "task_loss": 0.3580619990825653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40159040689468384, + "epoch": 9.58, + "learning_rate": 2.109044801352494e-06, + "loss": 0.3839, + "step": 11331, + "task_loss": 0.7435581684112549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22182714939117432, + "epoch": 9.58, + "learning_rate": 2.1048182586644123e-06, + "loss": 0.2732, + "step": 11332, + "task_loss": 0.2184019684791565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2464931458234787, + "epoch": 9.58, + "learning_rate": 2.1005917159763312e-06, + "loss": 0.3398, + "step": 11333, + "task_loss": 0.44217413663864136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25661230087280273, + "epoch": 9.58, + "learning_rate": 2.09636517328825e-06, + "loss": 0.2607, + "step": 11334, + "task_loss": 0.12492145597934723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2948495149612427, + "epoch": 9.58, + "learning_rate": 2.092138630600169e-06, + "loss": 0.3299, + "step": 11335, + "task_loss": 0.5423761606216431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5118556022644043, + "epoch": 9.58, + "learning_rate": 2.087912087912088e-06, + "loss": 0.44, + "step": 11336, + "task_loss": 1.2480664253234863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1786273568868637, + "epoch": 9.58, + "learning_rate": 2.083685545224007e-06, + "loss": 0.3445, + "step": 11337, + "task_loss": 0.2532385289669037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2957235276699066, + "epoch": 9.58, + "learning_rate": 2.0794590025359257e-06, + "loss": 0.3472, + "step": 11338, + "task_loss": 1.193177580833435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32421043515205383, + "epoch": 9.58, + "learning_rate": 2.0752324598478446e-06, + "loss": 0.3456, + "step": 11339, + "task_loss": 0.6723638772964478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7096904516220093, + "epoch": 9.59, + "learning_rate": 2.0710059171597635e-06, + "loss": 0.5919, + "step": 11340, + "task_loss": 1.3342965841293335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22693704068660736, + "epoch": 9.59, + "learning_rate": 2.0667793744716824e-06, + "loss": 0.4114, + "step": 11341, + "task_loss": 0.17792680859565735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2959129214286804, + "epoch": 9.59, + "learning_rate": 2.0625528317836012e-06, + "loss": 0.4811, + "step": 11342, + "task_loss": 0.21198199689388275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19234037399291992, + "epoch": 9.59, + "learning_rate": 2.05832628909552e-06, + "loss": 0.3488, + "step": 11343, + "task_loss": 0.10808181017637253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27044084668159485, + "epoch": 9.59, + "learning_rate": 2.054099746407439e-06, + "loss": 0.3654, + "step": 11344, + "task_loss": 1.0489733219146729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3765343725681305, + "epoch": 9.59, + "learning_rate": 2.049873203719358e-06, + "loss": 0.3137, + "step": 11345, + "task_loss": 0.9310837388038635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38879409432411194, + "epoch": 9.59, + "learning_rate": 2.045646661031277e-06, + "loss": 0.4871, + "step": 11346, + "task_loss": 1.2024022340774536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2554311454296112, + "epoch": 9.59, + "learning_rate": 2.0414201183431953e-06, + "loss": 0.3434, + "step": 11347, + "task_loss": 0.5166749358177185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5394352078437805, + "epoch": 9.59, + "learning_rate": 2.037193575655114e-06, + "loss": 0.3726, + "step": 11348, + "task_loss": 1.0180165767669678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18100568652153015, + "epoch": 9.59, + "learning_rate": 2.032967032967033e-06, + "loss": 0.3092, + "step": 11349, + "task_loss": 0.23527662456035614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35804274678230286, + "epoch": 9.59, + "learning_rate": 2.028740490278952e-06, + "loss": 0.3171, + "step": 11350, + "task_loss": 1.0076273679733276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21199451386928558, + "epoch": 9.59, + "learning_rate": 2.024513947590871e-06, + "loss": 0.3179, + "step": 11351, + "task_loss": 0.05430179089307785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39287057518959045, + "epoch": 9.6, + "learning_rate": 2.0202874049027897e-06, + "loss": 0.4013, + "step": 11352, + "task_loss": 0.37391263246536255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3621377646923065, + "epoch": 9.6, + "learning_rate": 2.0160608622147086e-06, + "loss": 0.4331, + "step": 11353, + "task_loss": 0.31450536847114563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2855967581272125, + "epoch": 9.6, + "learning_rate": 2.0118343195266275e-06, + "loss": 0.3319, + "step": 11354, + "task_loss": 0.7733631134033203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3082992434501648, + "epoch": 9.6, + "learning_rate": 2.007607776838546e-06, + "loss": 0.3981, + "step": 11355, + "task_loss": 1.32911217212677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3519038259983063, + "epoch": 9.6, + "learning_rate": 2.003381234150465e-06, + "loss": 0.3478, + "step": 11356, + "task_loss": 0.7286037802696228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2534392774105072, + "epoch": 9.6, + "learning_rate": 1.9991546914623838e-06, + "loss": 0.2666, + "step": 11357, + "task_loss": 0.22844351828098297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4133428931236267, + "epoch": 9.6, + "learning_rate": 1.9949281487743027e-06, + "loss": 0.4222, + "step": 11358, + "task_loss": 1.2354899644851685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42720797657966614, + "epoch": 9.6, + "learning_rate": 1.9907016060862216e-06, + "loss": 0.3185, + "step": 11359, + "task_loss": 0.29582083225250244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30216288566589355, + "epoch": 9.6, + "learning_rate": 1.9864750633981404e-06, + "loss": 0.3569, + "step": 11360, + "task_loss": 0.7684584856033325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19312812387943268, + "epoch": 9.6, + "learning_rate": 1.9822485207100593e-06, + "loss": 0.2425, + "step": 11361, + "task_loss": 0.13954511284828186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24762722849845886, + "epoch": 9.6, + "learning_rate": 1.9780219780219782e-06, + "loss": 0.3128, + "step": 11362, + "task_loss": 1.152559518814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2648628056049347, + "epoch": 9.6, + "learning_rate": 1.9737954353338967e-06, + "loss": 0.3865, + "step": 11363, + "task_loss": 0.4906817674636841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5737491250038147, + "epoch": 9.61, + "learning_rate": 1.9695688926458156e-06, + "loss": 0.3822, + "step": 11364, + "task_loss": 0.632621705532074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3169782757759094, + "epoch": 9.61, + "learning_rate": 1.9653423499577345e-06, + "loss": 0.3805, + "step": 11365, + "task_loss": 0.5344615578651428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2742988169193268, + "epoch": 9.61, + "learning_rate": 1.9611158072696534e-06, + "loss": 0.3415, + "step": 11366, + "task_loss": 0.5961724519729614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24963140487670898, + "epoch": 9.61, + "learning_rate": 1.9568892645815723e-06, + "loss": 0.2595, + "step": 11367, + "task_loss": 0.23455478250980377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23349100351333618, + "epoch": 9.61, + "learning_rate": 1.952662721893491e-06, + "loss": 0.3106, + "step": 11368, + "task_loss": 0.4577588438987732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2879914939403534, + "epoch": 9.61, + "learning_rate": 1.94843617920541e-06, + "loss": 0.3557, + "step": 11369, + "task_loss": 0.9814213514328003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19712060689926147, + "epoch": 9.61, + "learning_rate": 1.944209636517329e-06, + "loss": 0.3086, + "step": 11370, + "task_loss": 0.039380405098199844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2103244960308075, + "epoch": 9.61, + "learning_rate": 1.939983093829248e-06, + "loss": 0.3082, + "step": 11371, + "task_loss": 0.018880458548665047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5503775477409363, + "epoch": 9.61, + "learning_rate": 1.9357565511411667e-06, + "loss": 0.5378, + "step": 11372, + "task_loss": 0.29670798778533936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21757550537586212, + "epoch": 9.61, + "learning_rate": 1.9315300084530856e-06, + "loss": 0.277, + "step": 11373, + "task_loss": 0.5100223422050476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34573304653167725, + "epoch": 9.61, + "learning_rate": 1.9273034657650045e-06, + "loss": 0.3425, + "step": 11374, + "task_loss": 0.46197354793548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3948034644126892, + "epoch": 9.61, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.4335, + "step": 11375, + "task_loss": 0.960390031337738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47717419266700745, + "epoch": 9.62, + "learning_rate": 1.9188503803888423e-06, + "loss": 0.3592, + "step": 11376, + "task_loss": 0.6879274845123291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21797139942646027, + "epoch": 9.62, + "learning_rate": 1.9146238377007608e-06, + "loss": 0.2583, + "step": 11377, + "task_loss": 0.10829822719097137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21399691700935364, + "epoch": 9.62, + "learning_rate": 1.9103972950126796e-06, + "loss": 0.2461, + "step": 11378, + "task_loss": 0.18880638480186462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28747397661209106, + "epoch": 9.62, + "learning_rate": 1.9061707523245985e-06, + "loss": 0.29, + "step": 11379, + "task_loss": 0.24079172313213348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30355891585350037, + "epoch": 9.62, + "learning_rate": 1.9019442096365174e-06, + "loss": 0.3047, + "step": 11380, + "task_loss": 0.32455843687057495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.452084481716156, + "epoch": 9.62, + "learning_rate": 1.8977176669484363e-06, + "loss": 0.3538, + "step": 11381, + "task_loss": 0.7939251065254211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3114120364189148, + "epoch": 9.62, + "learning_rate": 1.8934911242603552e-06, + "loss": 0.3864, + "step": 11382, + "task_loss": 0.4872196912765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.423596054315567, + "epoch": 9.62, + "learning_rate": 1.889264581572274e-06, + "loss": 0.4635, + "step": 11383, + "task_loss": 1.2600687742233276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5550791621208191, + "epoch": 9.62, + "learning_rate": 1.8850380388841928e-06, + "loss": 0.4827, + "step": 11384, + "task_loss": 0.28835129737854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42487403750419617, + "epoch": 9.62, + "learning_rate": 1.8808114961961117e-06, + "loss": 0.5077, + "step": 11385, + "task_loss": 0.8663027882575989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3729102611541748, + "epoch": 9.62, + "learning_rate": 1.8765849535080306e-06, + "loss": 0.4624, + "step": 11386, + "task_loss": 0.9108969569206238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3118417263031006, + "epoch": 9.63, + "learning_rate": 1.8723584108199492e-06, + "loss": 0.3623, + "step": 11387, + "task_loss": 0.9128273129463196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35142892599105835, + "epoch": 9.63, + "learning_rate": 1.8681318681318681e-06, + "loss": 0.2891, + "step": 11388, + "task_loss": 0.3188212215900421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49902379512786865, + "epoch": 9.63, + "learning_rate": 1.863905325443787e-06, + "loss": 0.36, + "step": 11389, + "task_loss": 0.92162024974823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3398239314556122, + "epoch": 9.63, + "learning_rate": 1.859678782755706e-06, + "loss": 0.3131, + "step": 11390, + "task_loss": 0.29287195205688477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.67226243019104, + "epoch": 9.63, + "learning_rate": 1.8554522400676246e-06, + "loss": 0.4469, + "step": 11391, + "task_loss": 1.0422295331954956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27652254700660706, + "epoch": 9.63, + "learning_rate": 1.8512256973795435e-06, + "loss": 0.3273, + "step": 11392, + "task_loss": 0.2765454649925232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2788563370704651, + "epoch": 9.63, + "learning_rate": 1.8469991546914624e-06, + "loss": 0.4269, + "step": 11393, + "task_loss": 0.4959866404533386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18035463988780975, + "epoch": 9.63, + "learning_rate": 1.8427726120033813e-06, + "loss": 0.2439, + "step": 11394, + "task_loss": 0.3229225277900696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37946566939353943, + "epoch": 9.63, + "learning_rate": 1.8385460693153e-06, + "loss": 0.3984, + "step": 11395, + "task_loss": 0.28168636560440063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22433197498321533, + "epoch": 9.63, + "learning_rate": 1.8343195266272188e-06, + "loss": 0.3312, + "step": 11396, + "task_loss": 0.0423276424407959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3201439380645752, + "epoch": 9.63, + "learning_rate": 1.8300929839391377e-06, + "loss": 0.3414, + "step": 11397, + "task_loss": 0.2736188769340515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18343813717365265, + "epoch": 9.63, + "learning_rate": 1.8258664412510566e-06, + "loss": 0.2783, + "step": 11398, + "task_loss": 0.8561438918113708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3910684287548065, + "epoch": 9.64, + "learning_rate": 1.8216398985629753e-06, + "loss": 0.3657, + "step": 11399, + "task_loss": 0.3508215844631195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2434781938791275, + "epoch": 9.64, + "learning_rate": 1.8174133558748946e-06, + "loss": 0.4263, + "step": 11400, + "task_loss": 0.5328945517539978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2664779722690582, + "epoch": 9.64, + "learning_rate": 1.8131868131868135e-06, + "loss": 0.2873, + "step": 11401, + "task_loss": 0.18951082229614258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3395950198173523, + "epoch": 9.64, + "learning_rate": 1.8089602704987322e-06, + "loss": 0.3927, + "step": 11402, + "task_loss": 0.22719445824623108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28845998644828796, + "epoch": 9.64, + "learning_rate": 1.804733727810651e-06, + "loss": 0.3296, + "step": 11403, + "task_loss": 0.6665111184120178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3056619167327881, + "epoch": 9.64, + "learning_rate": 1.80050718512257e-06, + "loss": 0.4058, + "step": 11404, + "task_loss": 0.7225939631462097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1704697459936142, + "epoch": 9.64, + "learning_rate": 1.7962806424344889e-06, + "loss": 0.3247, + "step": 11405, + "task_loss": 0.7161139249801636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.235569030046463, + "epoch": 9.64, + "learning_rate": 1.7920540997464076e-06, + "loss": 0.3337, + "step": 11406, + "task_loss": 0.21453885734081268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1402040719985962, + "epoch": 9.64, + "learning_rate": 1.7878275570583264e-06, + "loss": 0.3531, + "step": 11407, + "task_loss": 0.4320604205131531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2700716257095337, + "epoch": 9.64, + "learning_rate": 1.7836010143702453e-06, + "loss": 0.4085, + "step": 11408, + "task_loss": 0.4586601257324219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26214319467544556, + "epoch": 9.64, + "learning_rate": 1.7793744716821642e-06, + "loss": 0.3216, + "step": 11409, + "task_loss": 0.22184021770954132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3785628080368042, + "epoch": 9.64, + "learning_rate": 1.775147928994083e-06, + "loss": 0.2913, + "step": 11410, + "task_loss": 0.23283220827579498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2353890985250473, + "epoch": 9.65, + "learning_rate": 1.7709213863060018e-06, + "loss": 0.3183, + "step": 11411, + "task_loss": 0.3049878180027008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18935465812683105, + "epoch": 9.65, + "learning_rate": 1.7666948436179207e-06, + "loss": 0.3174, + "step": 11412, + "task_loss": 0.7596449851989746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38802996277809143, + "epoch": 9.65, + "learning_rate": 1.7624683009298396e-06, + "loss": 0.3615, + "step": 11413, + "task_loss": 0.6841916441917419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4733309745788574, + "epoch": 9.65, + "learning_rate": 1.7582417582417583e-06, + "loss": 0.3877, + "step": 11414, + "task_loss": 0.3482346534729004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2754071056842804, + "epoch": 9.65, + "learning_rate": 1.7540152155536772e-06, + "loss": 0.3411, + "step": 11415, + "task_loss": 1.4552600383758545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3559128940105438, + "epoch": 9.65, + "learning_rate": 1.749788672865596e-06, + "loss": 0.396, + "step": 11416, + "task_loss": 0.563217043876648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2073933482170105, + "epoch": 9.65, + "learning_rate": 1.745562130177515e-06, + "loss": 0.3898, + "step": 11417, + "task_loss": 0.4485001862049103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4541964530944824, + "epoch": 9.65, + "learning_rate": 1.7413355874894336e-06, + "loss": 0.3379, + "step": 11418, + "task_loss": 0.17898324131965637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3260287642478943, + "epoch": 9.65, + "learning_rate": 1.7371090448013525e-06, + "loss": 0.4064, + "step": 11419, + "task_loss": 0.9959813356399536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4553569555282593, + "epoch": 9.65, + "learning_rate": 1.7328825021132714e-06, + "loss": 0.3656, + "step": 11420, + "task_loss": 0.9632174968719482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37856072187423706, + "epoch": 9.65, + "learning_rate": 1.7286559594251903e-06, + "loss": 0.3042, + "step": 11421, + "task_loss": 0.4925137758255005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3226199746131897, + "epoch": 9.65, + "learning_rate": 1.724429416737109e-06, + "loss": 0.338, + "step": 11422, + "task_loss": 0.19618257880210876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40906471014022827, + "epoch": 9.66, + "learning_rate": 1.7202028740490279e-06, + "loss": 0.3535, + "step": 11423, + "task_loss": 0.9799938201904297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.288569837808609, + "epoch": 9.66, + "learning_rate": 1.7159763313609468e-06, + "loss": 0.3266, + "step": 11424, + "task_loss": 0.462087482213974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33994609117507935, + "epoch": 9.66, + "learning_rate": 1.7117497886728656e-06, + "loss": 0.4332, + "step": 11425, + "task_loss": 1.495188593864441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2971399426460266, + "epoch": 9.66, + "learning_rate": 1.7075232459847843e-06, + "loss": 0.3769, + "step": 11426, + "task_loss": 0.6118338704109192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15497329831123352, + "epoch": 9.66, + "learning_rate": 1.7032967032967032e-06, + "loss": 0.3367, + "step": 11427, + "task_loss": 0.8847630620002747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3680981993675232, + "epoch": 9.66, + "learning_rate": 1.6990701606086221e-06, + "loss": 0.3761, + "step": 11428, + "task_loss": 0.3482116460800171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3056458830833435, + "epoch": 9.66, + "learning_rate": 1.6948436179205412e-06, + "loss": 0.3412, + "step": 11429, + "task_loss": 0.7409135103225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23351219296455383, + "epoch": 9.66, + "learning_rate": 1.69061707523246e-06, + "loss": 0.337, + "step": 11430, + "task_loss": 0.1401732861995697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3785826563835144, + "epoch": 9.66, + "learning_rate": 1.686390532544379e-06, + "loss": 0.3072, + "step": 11431, + "task_loss": 0.41874128580093384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41313013434410095, + "epoch": 9.66, + "learning_rate": 1.6821639898562977e-06, + "loss": 0.3431, + "step": 11432, + "task_loss": 0.8204946517944336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.417153537273407, + "epoch": 9.66, + "learning_rate": 1.6779374471682166e-06, + "loss": 0.3799, + "step": 11433, + "task_loss": 0.6660819053649902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1994740515947342, + "epoch": 9.66, + "learning_rate": 1.6737109044801355e-06, + "loss": 0.3536, + "step": 11434, + "task_loss": 0.5244724154472351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23480790853500366, + "epoch": 9.67, + "learning_rate": 1.6694843617920543e-06, + "loss": 0.3074, + "step": 11435, + "task_loss": 0.03405332192778587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.224386528134346, + "epoch": 9.67, + "learning_rate": 1.665257819103973e-06, + "loss": 0.4569, + "step": 11436, + "task_loss": 0.3570059537887573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.512499988079071, + "epoch": 9.67, + "learning_rate": 1.661031276415892e-06, + "loss": 0.3387, + "step": 11437, + "task_loss": 1.376373291015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3143555223941803, + "epoch": 9.67, + "learning_rate": 1.6568047337278108e-06, + "loss": 0.3209, + "step": 11438, + "task_loss": 0.8242653012275696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4584347605705261, + "epoch": 9.67, + "learning_rate": 1.6525781910397297e-06, + "loss": 0.3966, + "step": 11439, + "task_loss": 1.094449520111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2787601053714752, + "epoch": 9.67, + "learning_rate": 1.6483516483516484e-06, + "loss": 0.3488, + "step": 11440, + "task_loss": 0.29106733202934265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26899731159210205, + "epoch": 9.67, + "learning_rate": 1.6441251056635673e-06, + "loss": 0.3397, + "step": 11441, + "task_loss": 0.5184072256088257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27530956268310547, + "epoch": 9.67, + "learning_rate": 1.6398985629754862e-06, + "loss": 0.395, + "step": 11442, + "task_loss": 0.6371501684188843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43390512466430664, + "epoch": 9.67, + "learning_rate": 1.635672020287405e-06, + "loss": 0.3593, + "step": 11443, + "task_loss": 0.7042547464370728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4595538377761841, + "epoch": 9.67, + "learning_rate": 1.6314454775993237e-06, + "loss": 0.4018, + "step": 11444, + "task_loss": 1.0405006408691406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24976295232772827, + "epoch": 9.67, + "learning_rate": 1.6272189349112426e-06, + "loss": 0.3236, + "step": 11445, + "task_loss": 0.5751939415931702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26376330852508545, + "epoch": 9.67, + "learning_rate": 1.6229923922231615e-06, + "loss": 0.3831, + "step": 11446, + "task_loss": 0.9888198375701904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3995562493801117, + "epoch": 9.68, + "learning_rate": 1.6187658495350804e-06, + "loss": 0.4051, + "step": 11447, + "task_loss": 0.503148078918457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2619239091873169, + "epoch": 9.68, + "learning_rate": 1.614539306846999e-06, + "loss": 0.3027, + "step": 11448, + "task_loss": 0.42259958386421204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2721227705478668, + "epoch": 9.68, + "learning_rate": 1.610312764158918e-06, + "loss": 0.3326, + "step": 11449, + "task_loss": 0.402411550283432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21126922965049744, + "epoch": 9.68, + "learning_rate": 1.6060862214708369e-06, + "loss": 0.325, + "step": 11450, + "task_loss": 0.44474369287490845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2600010335445404, + "epoch": 9.68, + "learning_rate": 1.6018596787827558e-06, + "loss": 0.3593, + "step": 11451, + "task_loss": 0.6624702215194702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20123682916164398, + "epoch": 9.68, + "learning_rate": 1.5976331360946744e-06, + "loss": 0.3656, + "step": 11452, + "task_loss": 0.6612902879714966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.15425634384155273, + "epoch": 9.68, + "learning_rate": 1.5934065934065933e-06, + "loss": 0.4062, + "step": 11453, + "task_loss": 1.0239975452423096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38849514722824097, + "epoch": 9.68, + "learning_rate": 1.5891800507185122e-06, + "loss": 0.3113, + "step": 11454, + "task_loss": 0.1842092126607895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2801484167575836, + "epoch": 9.68, + "learning_rate": 1.5849535080304311e-06, + "loss": 0.3237, + "step": 11455, + "task_loss": 0.7476338744163513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16617603600025177, + "epoch": 9.68, + "learning_rate": 1.5807269653423498e-06, + "loss": 0.3585, + "step": 11456, + "task_loss": 0.1819252073764801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3295549750328064, + "epoch": 9.68, + "learning_rate": 1.5765004226542687e-06, + "loss": 0.411, + "step": 11457, + "task_loss": 0.12821581959724426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20386052131652832, + "epoch": 9.69, + "learning_rate": 1.5722738799661876e-06, + "loss": 0.3095, + "step": 11458, + "task_loss": 0.04509374499320984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3831862807273865, + "epoch": 9.69, + "learning_rate": 1.5680473372781067e-06, + "loss": 0.2851, + "step": 11459, + "task_loss": 0.5460574626922607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3222775459289551, + "epoch": 9.69, + "learning_rate": 1.5638207945900256e-06, + "loss": 0.3737, + "step": 11460, + "task_loss": 0.4812104105949402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2854434549808502, + "epoch": 9.69, + "learning_rate": 1.5595942519019443e-06, + "loss": 0.2827, + "step": 11461, + "task_loss": 0.6966005563735962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2175339162349701, + "epoch": 9.69, + "learning_rate": 1.5553677092138632e-06, + "loss": 0.3884, + "step": 11462, + "task_loss": 0.4979347288608551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24974189698696136, + "epoch": 9.69, + "learning_rate": 1.5511411665257818e-06, + "loss": 0.2975, + "step": 11463, + "task_loss": 0.14532750844955444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5196920037269592, + "epoch": 9.69, + "learning_rate": 1.5469146238377007e-06, + "loss": 0.438, + "step": 11464, + "task_loss": 1.0485488176345825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2503054440021515, + "epoch": 9.69, + "learning_rate": 1.5426880811496196e-06, + "loss": 0.312, + "step": 11465, + "task_loss": 0.7960480451583862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.8321512937545776, + "epoch": 9.69, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.5434, + "step": 11466, + "task_loss": 0.5021150708198547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29590654373168945, + "epoch": 9.69, + "learning_rate": 1.5342349957734574e-06, + "loss": 0.3848, + "step": 11467, + "task_loss": 0.4462801218032837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4700407385826111, + "epoch": 9.69, + "learning_rate": 1.5300084530853763e-06, + "loss": 0.4554, + "step": 11468, + "task_loss": 0.9669203758239746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.435629278421402, + "epoch": 9.69, + "learning_rate": 1.5257819103972952e-06, + "loss": 0.3604, + "step": 11469, + "task_loss": 1.2691365480422974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6612328290939331, + "epoch": 9.7, + "learning_rate": 1.521555367709214e-06, + "loss": 0.4595, + "step": 11470, + "task_loss": 0.7653945684432983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4318428635597229, + "epoch": 9.7, + "learning_rate": 1.5173288250211328e-06, + "loss": 0.4243, + "step": 11471, + "task_loss": 0.9471795558929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5161401629447937, + "epoch": 9.7, + "learning_rate": 1.5131022823330516e-06, + "loss": 0.3618, + "step": 11472, + "task_loss": 0.3132745325565338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4227718412876129, + "epoch": 9.7, + "learning_rate": 1.5088757396449705e-06, + "loss": 0.3155, + "step": 11473, + "task_loss": 0.30955770611763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20657232403755188, + "epoch": 9.7, + "learning_rate": 1.5046491969568894e-06, + "loss": 0.2378, + "step": 11474, + "task_loss": 0.31897175312042236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5543172359466553, + "epoch": 9.7, + "learning_rate": 1.5004226542688081e-06, + "loss": 0.4827, + "step": 11475, + "task_loss": 0.2159649133682251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44140732288360596, + "epoch": 9.7, + "learning_rate": 1.496196111580727e-06, + "loss": 0.3811, + "step": 11476, + "task_loss": 0.4367848038673401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.284241259098053, + "epoch": 9.7, + "learning_rate": 1.4919695688926459e-06, + "loss": 0.373, + "step": 11477, + "task_loss": 0.2689046859741211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3105393052101135, + "epoch": 9.7, + "learning_rate": 1.4877430262045648e-06, + "loss": 0.3292, + "step": 11478, + "task_loss": 0.14010834693908691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40796399116516113, + "epoch": 9.7, + "learning_rate": 1.4835164835164835e-06, + "loss": 0.3892, + "step": 11479, + "task_loss": 0.40134066343307495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5347166657447815, + "epoch": 9.7, + "learning_rate": 1.4792899408284024e-06, + "loss": 0.3992, + "step": 11480, + "task_loss": 1.2211244106292725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3464749753475189, + "epoch": 9.7, + "learning_rate": 1.4750633981403215e-06, + "loss": 0.4054, + "step": 11481, + "task_loss": 1.366295576095581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2973012328147888, + "epoch": 9.71, + "learning_rate": 1.4708368554522401e-06, + "loss": 0.4435, + "step": 11482, + "task_loss": 0.32228514552116394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25689366459846497, + "epoch": 9.71, + "learning_rate": 1.466610312764159e-06, + "loss": 0.3382, + "step": 11483, + "task_loss": 0.8613317608833313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31057053804397583, + "epoch": 9.71, + "learning_rate": 1.462383770076078e-06, + "loss": 0.3769, + "step": 11484, + "task_loss": 0.31447872519493103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37916329503059387, + "epoch": 9.71, + "learning_rate": 1.4581572273879968e-06, + "loss": 0.3606, + "step": 11485, + "task_loss": 0.511221170425415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2058028280735016, + "epoch": 9.71, + "learning_rate": 1.4539306846999155e-06, + "loss": 0.275, + "step": 11486, + "task_loss": 0.724391758441925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16803047060966492, + "epoch": 9.71, + "learning_rate": 1.4497041420118344e-06, + "loss": 0.3363, + "step": 11487, + "task_loss": 0.06658004224300385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4559987783432007, + "epoch": 9.71, + "learning_rate": 1.4454775993237533e-06, + "loss": 0.3721, + "step": 11488, + "task_loss": 0.9642744064331055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3305496573448181, + "epoch": 9.71, + "learning_rate": 1.4412510566356722e-06, + "loss": 0.391, + "step": 11489, + "task_loss": 0.30208584666252136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20133759081363678, + "epoch": 9.71, + "learning_rate": 1.4370245139475908e-06, + "loss": 0.3551, + "step": 11490, + "task_loss": 0.6635864973068237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5491741299629211, + "epoch": 9.71, + "learning_rate": 1.4327979712595097e-06, + "loss": 0.5044, + "step": 11491, + "task_loss": 0.6537925004959106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27279967069625854, + "epoch": 9.71, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.2547, + "step": 11492, + "task_loss": 0.2347949892282486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25307825207710266, + "epoch": 9.71, + "learning_rate": 1.4243448858833475e-06, + "loss": 0.3161, + "step": 11493, + "task_loss": 0.0667300820350647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5817744731903076, + "epoch": 9.72, + "learning_rate": 1.4201183431952662e-06, + "loss": 0.4563, + "step": 11494, + "task_loss": 1.0616064071655273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3033926486968994, + "epoch": 9.72, + "learning_rate": 1.415891800507185e-06, + "loss": 0.3704, + "step": 11495, + "task_loss": 1.3076547384262085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39327317476272583, + "epoch": 9.72, + "learning_rate": 1.4116652578191042e-06, + "loss": 0.3331, + "step": 11496, + "task_loss": 0.31131288409233093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20547592639923096, + "epoch": 9.72, + "learning_rate": 1.4074387151310229e-06, + "loss": 0.2665, + "step": 11497, + "task_loss": 0.43815693259239197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16236069798469543, + "epoch": 9.72, + "learning_rate": 1.4032121724429418e-06, + "loss": 0.4039, + "step": 11498, + "task_loss": 0.5727542638778687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.46614569425582886, + "epoch": 9.72, + "learning_rate": 1.3989856297548607e-06, + "loss": 0.4063, + "step": 11499, + "task_loss": 0.8678928017616272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22096040844917297, + "epoch": 9.72, + "learning_rate": 1.3947590870667795e-06, + "loss": 0.312, + "step": 11500, + "task_loss": 0.6331918239593506 + }, + { + "epoch": 9.72, + "eval_accuracy": 0.9171881188118812, + "eval_loss": 0.24021731317043304, + "eval_runtime": 226.013, + "eval_samples_per_second": 111.719, + "eval_steps_per_second": 0.876, + "step": 11500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42182546854019165, + "epoch": 9.72, + "learning_rate": 1.3905325443786982e-06, + "loss": 0.3711, + "step": 11501, + "task_loss": 0.6707150936126709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2353561818599701, + "epoch": 9.72, + "learning_rate": 1.3863060016906171e-06, + "loss": 0.3023, + "step": 11502, + "task_loss": 0.38667890429496765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2755362391471863, + "epoch": 9.72, + "learning_rate": 1.382079459002536e-06, + "loss": 0.3937, + "step": 11503, + "task_loss": 0.7184953689575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35116761922836304, + "epoch": 9.72, + "learning_rate": 1.377852916314455e-06, + "loss": 0.3963, + "step": 11504, + "task_loss": 0.492603063583374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3725813031196594, + "epoch": 9.72, + "learning_rate": 1.3736263736263736e-06, + "loss": 0.2927, + "step": 11505, + "task_loss": 0.6261778473854065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18086005747318268, + "epoch": 9.73, + "learning_rate": 1.3693998309382925e-06, + "loss": 0.2974, + "step": 11506, + "task_loss": 0.74566650390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2075197696685791, + "epoch": 9.73, + "learning_rate": 1.3651732882502114e-06, + "loss": 0.2489, + "step": 11507, + "task_loss": 0.2277413159608841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4005768299102783, + "epoch": 9.73, + "learning_rate": 1.3609467455621303e-06, + "loss": 0.3565, + "step": 11508, + "task_loss": 1.1678518056869507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29778093099594116, + "epoch": 9.73, + "learning_rate": 1.356720202874049e-06, + "loss": 0.3013, + "step": 11509, + "task_loss": 0.4046175479888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3814990222454071, + "epoch": 9.73, + "learning_rate": 1.3524936601859678e-06, + "loss": 0.42, + "step": 11510, + "task_loss": 0.5970731973648071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5035764575004578, + "epoch": 9.73, + "learning_rate": 1.348267117497887e-06, + "loss": 0.4031, + "step": 11511, + "task_loss": 1.3242605924606323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6035146713256836, + "epoch": 9.73, + "learning_rate": 1.3440405748098056e-06, + "loss": 0.4419, + "step": 11512, + "task_loss": 0.5378085970878601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38605213165283203, + "epoch": 9.73, + "learning_rate": 1.3398140321217245e-06, + "loss": 0.4696, + "step": 11513, + "task_loss": 0.40572240948677063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33871760964393616, + "epoch": 9.73, + "learning_rate": 1.3355874894336434e-06, + "loss": 0.4162, + "step": 11514, + "task_loss": 0.8985888361930847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2023114413022995, + "epoch": 9.73, + "learning_rate": 1.3313609467455623e-06, + "loss": 0.2346, + "step": 11515, + "task_loss": 0.6903096437454224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28632354736328125, + "epoch": 9.73, + "learning_rate": 1.327134404057481e-06, + "loss": 0.2907, + "step": 11516, + "task_loss": 0.5053189396858215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2577715218067169, + "epoch": 9.73, + "learning_rate": 1.3229078613693999e-06, + "loss": 0.3585, + "step": 11517, + "task_loss": 0.10421779751777649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49105221033096313, + "epoch": 9.74, + "learning_rate": 1.3186813186813187e-06, + "loss": 0.3824, + "step": 11518, + "task_loss": 0.41296112537384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21086427569389343, + "epoch": 9.74, + "learning_rate": 1.3144547759932376e-06, + "loss": 0.3201, + "step": 11519, + "task_loss": 0.398397833108902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21979159116744995, + "epoch": 9.74, + "learning_rate": 1.3102282333051563e-06, + "loss": 0.3003, + "step": 11520, + "task_loss": 0.052946317940950394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3397313356399536, + "epoch": 9.74, + "learning_rate": 1.3060016906170752e-06, + "loss": 0.4624, + "step": 11521, + "task_loss": 0.9207710027694702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5669938325881958, + "epoch": 9.74, + "learning_rate": 1.301775147928994e-06, + "loss": 0.3863, + "step": 11522, + "task_loss": 0.9419997334480286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18719340860843658, + "epoch": 9.74, + "learning_rate": 1.297548605240913e-06, + "loss": 0.3273, + "step": 11523, + "task_loss": 0.4451925456523895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3807867765426636, + "epoch": 9.74, + "learning_rate": 1.2933220625528317e-06, + "loss": 0.3498, + "step": 11524, + "task_loss": 0.24388277530670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5017861723899841, + "epoch": 9.74, + "learning_rate": 1.2890955198647506e-06, + "loss": 0.3739, + "step": 11525, + "task_loss": 0.5985338687896729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4993566572666168, + "epoch": 9.74, + "learning_rate": 1.2848689771766697e-06, + "loss": 0.3549, + "step": 11526, + "task_loss": 0.7203245162963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18379151821136475, + "epoch": 9.74, + "learning_rate": 1.2806424344885886e-06, + "loss": 0.2299, + "step": 11527, + "task_loss": 0.12222649157047272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28931543231010437, + "epoch": 9.74, + "learning_rate": 1.2764158918005072e-06, + "loss": 0.3222, + "step": 11528, + "task_loss": 0.6729823350906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3812021017074585, + "epoch": 9.75, + "learning_rate": 1.2721893491124261e-06, + "loss": 0.285, + "step": 11529, + "task_loss": 0.5738945007324219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2791770398616791, + "epoch": 9.75, + "learning_rate": 1.267962806424345e-06, + "loss": 0.2271, + "step": 11530, + "task_loss": 0.3496460020542145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23038524389266968, + "epoch": 9.75, + "learning_rate": 1.263736263736264e-06, + "loss": 0.3871, + "step": 11531, + "task_loss": 0.3085528016090393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3166860342025757, + "epoch": 9.75, + "learning_rate": 1.2595097210481826e-06, + "loss": 0.2987, + "step": 11532, + "task_loss": 0.5186700820922852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31599241495132446, + "epoch": 9.75, + "learning_rate": 1.2552831783601015e-06, + "loss": 0.276, + "step": 11533, + "task_loss": 0.32217615842819214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1491033434867859, + "epoch": 9.75, + "learning_rate": 1.2510566356720204e-06, + "loss": 0.2725, + "step": 11534, + "task_loss": 0.1769881248474121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38832467794418335, + "epoch": 9.75, + "learning_rate": 1.2468300929839393e-06, + "loss": 0.3046, + "step": 11535, + "task_loss": 1.1538777351379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3228498697280884, + "epoch": 9.75, + "learning_rate": 1.242603550295858e-06, + "loss": 0.3606, + "step": 11536, + "task_loss": 0.46882614493370056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23697489500045776, + "epoch": 9.75, + "learning_rate": 1.2383770076077768e-06, + "loss": 0.293, + "step": 11537, + "task_loss": 0.5640213489532471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6739101409912109, + "epoch": 9.75, + "learning_rate": 1.2341504649196957e-06, + "loss": 0.4538, + "step": 11538, + "task_loss": 0.6842974424362183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2328580766916275, + "epoch": 9.75, + "learning_rate": 1.2299239222316146e-06, + "loss": 0.3361, + "step": 11539, + "task_loss": 0.2995491623878479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43693795800209045, + "epoch": 9.75, + "learning_rate": 1.2256973795435333e-06, + "loss": 0.4444, + "step": 11540, + "task_loss": 1.7200641632080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16191980242729187, + "epoch": 9.76, + "learning_rate": 1.2214708368554524e-06, + "loss": 0.3562, + "step": 11541, + "task_loss": 0.2957480847835541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3020510971546173, + "epoch": 9.76, + "learning_rate": 1.2172442941673713e-06, + "loss": 0.427, + "step": 11542, + "task_loss": 0.89032381772995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4471988081932068, + "epoch": 9.76, + "learning_rate": 1.21301775147929e-06, + "loss": 0.381, + "step": 11543, + "task_loss": 0.6252469420433044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2768203020095825, + "epoch": 9.76, + "learning_rate": 1.2087912087912089e-06, + "loss": 0.2695, + "step": 11544, + "task_loss": 0.38265687227249146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21978306770324707, + "epoch": 9.76, + "learning_rate": 1.2045646661031278e-06, + "loss": 0.2546, + "step": 11545, + "task_loss": 0.4047967493534088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33415138721466064, + "epoch": 9.76, + "learning_rate": 1.2003381234150467e-06, + "loss": 0.4065, + "step": 11546, + "task_loss": 0.6916629076004028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5713497996330261, + "epoch": 9.76, + "learning_rate": 1.1961115807269653e-06, + "loss": 0.4325, + "step": 11547, + "task_loss": 0.43093061447143555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1856275349855423, + "epoch": 9.76, + "learning_rate": 1.1918850380388842e-06, + "loss": 0.3619, + "step": 11548, + "task_loss": 0.7085734605789185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3406912684440613, + "epoch": 9.76, + "learning_rate": 1.1876584953508031e-06, + "loss": 0.3242, + "step": 11549, + "task_loss": 0.522082507610321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.332335889339447, + "epoch": 9.76, + "learning_rate": 1.183431952662722e-06, + "loss": 0.3338, + "step": 11550, + "task_loss": 0.4854016900062561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22885192930698395, + "epoch": 9.76, + "learning_rate": 1.1792054099746407e-06, + "loss": 0.305, + "step": 11551, + "task_loss": 0.6380324363708496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3433436453342438, + "epoch": 9.76, + "learning_rate": 1.1749788672865596e-06, + "loss": 0.3856, + "step": 11552, + "task_loss": 0.5762419700622559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3392272889614105, + "epoch": 9.77, + "learning_rate": 1.1707523245984785e-06, + "loss": 0.3263, + "step": 11553, + "task_loss": 0.38463836908340454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34654638171195984, + "epoch": 9.77, + "learning_rate": 1.1665257819103974e-06, + "loss": 0.3287, + "step": 11554, + "task_loss": 0.44732412695884705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6483772397041321, + "epoch": 9.77, + "learning_rate": 1.162299239222316e-06, + "loss": 0.4443, + "step": 11555, + "task_loss": 0.9998729228973389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5327861905097961, + "epoch": 9.77, + "learning_rate": 1.1580726965342351e-06, + "loss": 0.3523, + "step": 11556, + "task_loss": 0.38029369711875916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31533241271972656, + "epoch": 9.77, + "learning_rate": 1.153846153846154e-06, + "loss": 0.3954, + "step": 11557, + "task_loss": 0.6218183636665344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23144280910491943, + "epoch": 9.77, + "learning_rate": 1.1496196111580727e-06, + "loss": 0.3568, + "step": 11558, + "task_loss": 0.5519044995307922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2794681489467621, + "epoch": 9.77, + "learning_rate": 1.1453930684699916e-06, + "loss": 0.4217, + "step": 11559, + "task_loss": 0.5025290846824646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19840696454048157, + "epoch": 9.77, + "learning_rate": 1.1411665257819105e-06, + "loss": 0.2872, + "step": 11560, + "task_loss": 0.402765154838562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26462483406066895, + "epoch": 9.77, + "learning_rate": 1.1369399830938294e-06, + "loss": 0.37, + "step": 11561, + "task_loss": 0.3709114193916321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4361591935157776, + "epoch": 9.77, + "learning_rate": 1.132713440405748e-06, + "loss": 0.4948, + "step": 11562, + "task_loss": 0.7134681344032288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4421406686306, + "epoch": 9.77, + "learning_rate": 1.128486897717667e-06, + "loss": 0.3297, + "step": 11563, + "task_loss": 0.9490401744842529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35591650009155273, + "epoch": 9.77, + "learning_rate": 1.1242603550295859e-06, + "loss": 0.361, + "step": 11564, + "task_loss": 0.9820961952209473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32459041476249695, + "epoch": 9.78, + "learning_rate": 1.1200338123415047e-06, + "loss": 0.3816, + "step": 11565, + "task_loss": 0.3405025899410248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2934322655200958, + "epoch": 9.78, + "learning_rate": 1.1158072696534234e-06, + "loss": 0.3137, + "step": 11566, + "task_loss": 0.9034987688064575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36788052320480347, + "epoch": 9.78, + "learning_rate": 1.1115807269653423e-06, + "loss": 0.2951, + "step": 11567, + "task_loss": 0.20638689398765564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3218073844909668, + "epoch": 9.78, + "learning_rate": 1.1073541842772612e-06, + "loss": 0.299, + "step": 11568, + "task_loss": 0.36746034026145935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5426002740859985, + "epoch": 9.78, + "learning_rate": 1.10312764158918e-06, + "loss": 0.385, + "step": 11569, + "task_loss": 0.9694886207580566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3522389233112335, + "epoch": 9.78, + "learning_rate": 1.098901098901099e-06, + "loss": 0.4498, + "step": 11570, + "task_loss": 1.200997233390808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.304076611995697, + "epoch": 9.78, + "learning_rate": 1.0946745562130179e-06, + "loss": 0.3211, + "step": 11571, + "task_loss": 0.7263556122779846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4278223216533661, + "epoch": 9.78, + "learning_rate": 1.0904480135249368e-06, + "loss": 0.3513, + "step": 11572, + "task_loss": 0.6221022009849548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.266663134098053, + "epoch": 9.78, + "learning_rate": 1.0862214708368555e-06, + "loss": 0.3011, + "step": 11573, + "task_loss": 0.7180951833724976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2807101905345917, + "epoch": 9.78, + "learning_rate": 1.0819949281487743e-06, + "loss": 0.3923, + "step": 11574, + "task_loss": 0.8153069019317627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3259927034378052, + "epoch": 9.78, + "learning_rate": 1.0777683854606932e-06, + "loss": 0.3119, + "step": 11575, + "task_loss": 0.6077529191970825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3847094178199768, + "epoch": 9.78, + "learning_rate": 1.0735418427726121e-06, + "loss": 0.3551, + "step": 11576, + "task_loss": 1.1541520357131958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.41597694158554077, + "epoch": 9.79, + "learning_rate": 1.0693153000845308e-06, + "loss": 0.3672, + "step": 11577, + "task_loss": 0.5734269618988037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2670394480228424, + "epoch": 9.79, + "learning_rate": 1.0650887573964497e-06, + "loss": 0.3061, + "step": 11578, + "task_loss": 0.5403621196746826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3795333504676819, + "epoch": 9.79, + "learning_rate": 1.0608622147083686e-06, + "loss": 0.3599, + "step": 11579, + "task_loss": 0.6898123025894165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4665977954864502, + "epoch": 9.79, + "learning_rate": 1.0566356720202875e-06, + "loss": 0.4495, + "step": 11580, + "task_loss": 0.9498779773712158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3978946805000305, + "epoch": 9.79, + "learning_rate": 1.0524091293322062e-06, + "loss": 0.4799, + "step": 11581, + "task_loss": 0.9153242707252502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2683635354042053, + "epoch": 9.79, + "learning_rate": 1.048182586644125e-06, + "loss": 0.3447, + "step": 11582, + "task_loss": 0.37186333537101746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21465590596199036, + "epoch": 9.79, + "learning_rate": 1.043956043956044e-06, + "loss": 0.3359, + "step": 11583, + "task_loss": 0.06667748838663101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27030330896377563, + "epoch": 9.79, + "learning_rate": 1.0397295012679628e-06, + "loss": 0.351, + "step": 11584, + "task_loss": 0.7504514455795288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18166187405586243, + "epoch": 9.79, + "learning_rate": 1.0355029585798817e-06, + "loss": 0.2479, + "step": 11585, + "task_loss": 0.399221807718277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3939642906188965, + "epoch": 9.79, + "learning_rate": 1.0312764158918006e-06, + "loss": 0.4348, + "step": 11586, + "task_loss": 0.229294091463089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3243493437767029, + "epoch": 9.79, + "learning_rate": 1.0270498732037195e-06, + "loss": 0.3211, + "step": 11587, + "task_loss": 1.0573574304580688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.45112287998199463, + "epoch": 9.79, + "learning_rate": 1.0228233305156384e-06, + "loss": 0.4166, + "step": 11588, + "task_loss": 0.5295839309692383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33278900384902954, + "epoch": 9.8, + "learning_rate": 1.018596787827557e-06, + "loss": 0.2867, + "step": 11589, + "task_loss": 0.3663014769554138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24121826887130737, + "epoch": 9.8, + "learning_rate": 1.014370245139476e-06, + "loss": 0.3537, + "step": 11590, + "task_loss": 0.5832351446151733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3033680319786072, + "epoch": 9.8, + "learning_rate": 1.0101437024513949e-06, + "loss": 0.4487, + "step": 11591, + "task_loss": 0.6016051769256592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3556279242038727, + "epoch": 9.8, + "learning_rate": 1.0059171597633138e-06, + "loss": 0.3237, + "step": 11592, + "task_loss": 0.15073081851005554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26481083035469055, + "epoch": 9.8, + "learning_rate": 1.0016906170752324e-06, + "loss": 0.3327, + "step": 11593, + "task_loss": 0.04011211916804314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2339170128107071, + "epoch": 9.8, + "learning_rate": 9.974640743871513e-07, + "loss": 0.3434, + "step": 11594, + "task_loss": 0.21927061676979065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2987024486064911, + "epoch": 9.8, + "learning_rate": 9.932375316990702e-07, + "loss": 0.3275, + "step": 11595, + "task_loss": 0.16293509304523468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3047736883163452, + "epoch": 9.8, + "learning_rate": 9.890109890109891e-07, + "loss": 0.3202, + "step": 11596, + "task_loss": 0.8584185242652893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25748249888420105, + "epoch": 9.8, + "learning_rate": 9.847844463229078e-07, + "loss": 0.3336, + "step": 11597, + "task_loss": 0.6500364542007446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24285843968391418, + "epoch": 9.8, + "learning_rate": 9.805579036348267e-07, + "loss": 0.4336, + "step": 11598, + "task_loss": 0.41308924555778503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2419148087501526, + "epoch": 9.8, + "learning_rate": 9.763313609467456e-07, + "loss": 0.3728, + "step": 11599, + "task_loss": 0.12282896041870117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29451170563697815, + "epoch": 9.81, + "learning_rate": 9.721048182586645e-07, + "loss": 0.3962, + "step": 11600, + "task_loss": 0.2076481133699417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4167931079864502, + "epoch": 9.81, + "learning_rate": 9.678782755705834e-07, + "loss": 0.4279, + "step": 11601, + "task_loss": 1.2686623334884644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38243359327316284, + "epoch": 9.81, + "learning_rate": 9.636517328825023e-07, + "loss": 0.5271, + "step": 11602, + "task_loss": 0.05235043913125992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2847591042518616, + "epoch": 9.81, + "learning_rate": 9.594251901944211e-07, + "loss": 0.3144, + "step": 11603, + "task_loss": 0.4899735152721405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2498854100704193, + "epoch": 9.81, + "learning_rate": 9.551986475063398e-07, + "loss": 0.3734, + "step": 11604, + "task_loss": 0.7233586311340332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2272118628025055, + "epoch": 9.81, + "learning_rate": 9.509721048182587e-07, + "loss": 0.3196, + "step": 11605, + "task_loss": 0.6324823498725891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24383017420768738, + "epoch": 9.81, + "learning_rate": 9.467455621301776e-07, + "loss": 0.3047, + "step": 11606, + "task_loss": 0.14181889593601227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.282803475856781, + "epoch": 9.81, + "learning_rate": 9.425190194420964e-07, + "loss": 0.3255, + "step": 11607, + "task_loss": 0.10189807415008545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2623617649078369, + "epoch": 9.81, + "learning_rate": 9.382924767540153e-07, + "loss": 0.4392, + "step": 11608, + "task_loss": 0.6769603490829468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24189922213554382, + "epoch": 9.81, + "learning_rate": 9.340659340659341e-07, + "loss": 0.3746, + "step": 11609, + "task_loss": 0.07162756472826004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2872876524925232, + "epoch": 9.81, + "learning_rate": 9.29839391377853e-07, + "loss": 0.2901, + "step": 11610, + "task_loss": 0.6414722800254822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4856971800327301, + "epoch": 9.81, + "learning_rate": 9.256128486897717e-07, + "loss": 0.3248, + "step": 11611, + "task_loss": 0.81694495677948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31776583194732666, + "epoch": 9.82, + "learning_rate": 9.213863060016906e-07, + "loss": 0.3435, + "step": 11612, + "task_loss": 0.27536746859550476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3352828621864319, + "epoch": 9.82, + "learning_rate": 9.171597633136094e-07, + "loss": 0.3293, + "step": 11613, + "task_loss": 0.23912334442138672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3049991726875305, + "epoch": 9.82, + "learning_rate": 9.129332206255283e-07, + "loss": 0.4625, + "step": 11614, + "task_loss": 0.8003309369087219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32282668352127075, + "epoch": 9.82, + "learning_rate": 9.087066779374473e-07, + "loss": 0.3723, + "step": 11615, + "task_loss": 0.7033539414405823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2525600790977478, + "epoch": 9.82, + "learning_rate": 9.044801352493661e-07, + "loss": 0.3721, + "step": 11616, + "task_loss": 0.699455738067627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25242024660110474, + "epoch": 9.82, + "learning_rate": 9.00253592561285e-07, + "loss": 0.2967, + "step": 11617, + "task_loss": 0.2816600501537323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.44912272691726685, + "epoch": 9.82, + "learning_rate": 8.960270498732038e-07, + "loss": 0.3173, + "step": 11618, + "task_loss": 1.7904939651489258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.31990697979927063, + "epoch": 9.82, + "learning_rate": 8.918005071851227e-07, + "loss": 0.2656, + "step": 11619, + "task_loss": 0.5626444220542908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.589996874332428, + "epoch": 9.82, + "learning_rate": 8.875739644970415e-07, + "loss": 0.4112, + "step": 11620, + "task_loss": 1.1437616348266602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3013782799243927, + "epoch": 9.82, + "learning_rate": 8.833474218089603e-07, + "loss": 0.3393, + "step": 11621, + "task_loss": 0.47835880517959595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14221367239952087, + "epoch": 9.82, + "learning_rate": 8.791208791208791e-07, + "loss": 0.2425, + "step": 11622, + "task_loss": 0.18015243113040924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14566174149513245, + "epoch": 9.82, + "learning_rate": 8.74894336432798e-07, + "loss": 0.2978, + "step": 11623, + "task_loss": 0.2991926074028015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5476205348968506, + "epoch": 9.83, + "learning_rate": 8.706677937447168e-07, + "loss": 0.4761, + "step": 11624, + "task_loss": 1.2383973598480225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22270378470420837, + "epoch": 9.83, + "learning_rate": 8.664412510566357e-07, + "loss": 0.2882, + "step": 11625, + "task_loss": 0.28122350573539734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24863888323307037, + "epoch": 9.83, + "learning_rate": 8.622147083685545e-07, + "loss": 0.3772, + "step": 11626, + "task_loss": 0.16144612431526184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4593784809112549, + "epoch": 9.83, + "learning_rate": 8.579881656804734e-07, + "loss": 0.3718, + "step": 11627, + "task_loss": 0.8021537065505981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36984097957611084, + "epoch": 9.83, + "learning_rate": 8.537616229923922e-07, + "loss": 0.4044, + "step": 11628, + "task_loss": 1.0335593223571777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49677038192749023, + "epoch": 9.83, + "learning_rate": 8.495350803043111e-07, + "loss": 0.3859, + "step": 11629, + "task_loss": 0.7184372544288635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34113413095474243, + "epoch": 9.83, + "learning_rate": 8.4530853761623e-07, + "loss": 0.402, + "step": 11630, + "task_loss": 0.2766909599304199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.14213956892490387, + "epoch": 9.83, + "learning_rate": 8.410819949281488e-07, + "loss": 0.3076, + "step": 11631, + "task_loss": 0.10918843746185303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3124306797981262, + "epoch": 9.83, + "learning_rate": 8.368554522400677e-07, + "loss": 0.4503, + "step": 11632, + "task_loss": 0.4812248945236206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34216147661209106, + "epoch": 9.83, + "learning_rate": 8.326289095519865e-07, + "loss": 0.3075, + "step": 11633, + "task_loss": 0.4338526427745819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4333937168121338, + "epoch": 9.83, + "learning_rate": 8.284023668639054e-07, + "loss": 0.3339, + "step": 11634, + "task_loss": 0.45242375135421753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19510699808597565, + "epoch": 9.83, + "learning_rate": 8.241758241758242e-07, + "loss": 0.3675, + "step": 11635, + "task_loss": 0.28190428018569946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3510788679122925, + "epoch": 9.84, + "learning_rate": 8.199492814877431e-07, + "loss": 0.4133, + "step": 11636, + "task_loss": 0.6353777647018433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36686286330223083, + "epoch": 9.84, + "learning_rate": 8.157227387996619e-07, + "loss": 0.3044, + "step": 11637, + "task_loss": 0.9839715957641602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38448283076286316, + "epoch": 9.84, + "learning_rate": 8.114961961115808e-07, + "loss": 0.3413, + "step": 11638, + "task_loss": 0.8712025284767151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24727946519851685, + "epoch": 9.84, + "learning_rate": 8.072696534234995e-07, + "loss": 0.2839, + "step": 11639, + "task_loss": 0.13456924259662628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.316354364156723, + "epoch": 9.84, + "learning_rate": 8.030431107354184e-07, + "loss": 0.2796, + "step": 11640, + "task_loss": 1.0290857553482056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2354334592819214, + "epoch": 9.84, + "learning_rate": 7.988165680473372e-07, + "loss": 0.385, + "step": 11641, + "task_loss": 0.41232091188430786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32979440689086914, + "epoch": 9.84, + "learning_rate": 7.945900253592561e-07, + "loss": 0.3401, + "step": 11642, + "task_loss": 0.4732211232185364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30502140522003174, + "epoch": 9.84, + "learning_rate": 7.903634826711749e-07, + "loss": 0.3362, + "step": 11643, + "task_loss": 0.3674145042896271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28016650676727295, + "epoch": 9.84, + "learning_rate": 7.861369399830938e-07, + "loss": 0.3047, + "step": 11644, + "task_loss": 0.2455063909292221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5164601802825928, + "epoch": 9.84, + "learning_rate": 7.819103972950128e-07, + "loss": 0.377, + "step": 11645, + "task_loss": 0.833278238773346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3164857029914856, + "epoch": 9.84, + "learning_rate": 7.776838546069316e-07, + "loss": 0.4618, + "step": 11646, + "task_loss": 1.531575083732605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2939262390136719, + "epoch": 9.84, + "learning_rate": 7.734573119188504e-07, + "loss": 0.3897, + "step": 11647, + "task_loss": 0.813907265663147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3977244198322296, + "epoch": 9.85, + "learning_rate": 7.692307692307694e-07, + "loss": 0.2994, + "step": 11648, + "task_loss": 0.45476028323173523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.293307363986969, + "epoch": 9.85, + "learning_rate": 7.650042265426881e-07, + "loss": 0.3045, + "step": 11649, + "task_loss": 0.3076777458190918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.470998078584671, + "epoch": 9.85, + "learning_rate": 7.60777683854607e-07, + "loss": 0.3801, + "step": 11650, + "task_loss": 0.5332295894622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3325941860675812, + "epoch": 9.85, + "learning_rate": 7.565511411665258e-07, + "loss": 0.3193, + "step": 11651, + "task_loss": 0.42220330238342285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2850295305252075, + "epoch": 9.85, + "learning_rate": 7.523245984784447e-07, + "loss": 0.3157, + "step": 11652, + "task_loss": 0.8529147505760193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34869807958602905, + "epoch": 9.85, + "learning_rate": 7.480980557903635e-07, + "loss": 0.4398, + "step": 11653, + "task_loss": 0.6998187899589539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27215325832366943, + "epoch": 9.85, + "learning_rate": 7.438715131022824e-07, + "loss": 0.2025, + "step": 11654, + "task_loss": 1.2488270998001099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33452659845352173, + "epoch": 9.85, + "learning_rate": 7.396449704142012e-07, + "loss": 0.3508, + "step": 11655, + "task_loss": 0.511022686958313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4641001522541046, + "epoch": 9.85, + "learning_rate": 7.354184277261201e-07, + "loss": 0.3915, + "step": 11656, + "task_loss": 0.6546111106872559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2676555812358856, + "epoch": 9.85, + "learning_rate": 7.31191885038039e-07, + "loss": 0.3914, + "step": 11657, + "task_loss": 0.044949520379304886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5977452397346497, + "epoch": 9.85, + "learning_rate": 7.269653423499577e-07, + "loss": 0.4101, + "step": 11658, + "task_loss": 1.1131128072738647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23896104097366333, + "epoch": 9.85, + "learning_rate": 7.227387996618766e-07, + "loss": 0.4033, + "step": 11659, + "task_loss": 0.2637478709220886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2893185317516327, + "epoch": 9.86, + "learning_rate": 7.185122569737954e-07, + "loss": 0.4162, + "step": 11660, + "task_loss": 0.9951228499412537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35025647282600403, + "epoch": 9.86, + "learning_rate": 7.142857142857143e-07, + "loss": 0.2246, + "step": 11661, + "task_loss": 0.5603922605514526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37135398387908936, + "epoch": 9.86, + "learning_rate": 7.100591715976331e-07, + "loss": 0.2985, + "step": 11662, + "task_loss": 0.5651071667671204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30270451307296753, + "epoch": 9.86, + "learning_rate": 7.058326289095521e-07, + "loss": 0.3694, + "step": 11663, + "task_loss": 0.9176321029663086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3636479377746582, + "epoch": 9.86, + "learning_rate": 7.016060862214709e-07, + "loss": 0.3244, + "step": 11664, + "task_loss": 0.3930325508117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3956620693206787, + "epoch": 9.86, + "learning_rate": 6.973795435333898e-07, + "loss": 0.4029, + "step": 11665, + "task_loss": 0.3240499496459961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5820457935333252, + "epoch": 9.86, + "learning_rate": 6.931530008453086e-07, + "loss": 0.545, + "step": 11666, + "task_loss": 1.336995244026184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27669885754585266, + "epoch": 9.86, + "learning_rate": 6.889264581572275e-07, + "loss": 0.2479, + "step": 11667, + "task_loss": 0.4336546063423157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43685275316238403, + "epoch": 9.86, + "learning_rate": 6.846999154691462e-07, + "loss": 0.4214, + "step": 11668, + "task_loss": 1.184675931930542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.48736968636512756, + "epoch": 9.86, + "learning_rate": 6.804733727810651e-07, + "loss": 0.3439, + "step": 11669, + "task_loss": 1.307713508605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.7637746334075928, + "epoch": 9.86, + "learning_rate": 6.762468300929839e-07, + "loss": 0.496, + "step": 11670, + "task_loss": 0.8066900372505188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29835572838783264, + "epoch": 9.87, + "learning_rate": 6.720202874049028e-07, + "loss": 0.3217, + "step": 11671, + "task_loss": 0.35983768105506897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.21950377523899078, + "epoch": 9.87, + "learning_rate": 6.677937447168217e-07, + "loss": 0.3258, + "step": 11672, + "task_loss": 0.07857642322778702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32372206449508667, + "epoch": 9.87, + "learning_rate": 6.635672020287405e-07, + "loss": 0.2994, + "step": 11673, + "task_loss": 0.3583196699619293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30379703640937805, + "epoch": 9.87, + "learning_rate": 6.593406593406594e-07, + "loss": 0.382, + "step": 11674, + "task_loss": 1.4303418397903442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4729366898536682, + "epoch": 9.87, + "learning_rate": 6.551141166525782e-07, + "loss": 0.3884, + "step": 11675, + "task_loss": 1.2082672119140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3671010136604309, + "epoch": 9.87, + "learning_rate": 6.50887573964497e-07, + "loss": 0.3819, + "step": 11676, + "task_loss": 1.9791260957717896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1895056962966919, + "epoch": 9.87, + "learning_rate": 6.466610312764158e-07, + "loss": 0.3216, + "step": 11677, + "task_loss": 0.01748405396938324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.16998028755187988, + "epoch": 9.87, + "learning_rate": 6.424344885883348e-07, + "loss": 0.2553, + "step": 11678, + "task_loss": 0.24657247960567474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23196865618228912, + "epoch": 9.87, + "learning_rate": 6.382079459002536e-07, + "loss": 0.2172, + "step": 11679, + "task_loss": 0.12969322502613068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42950764298439026, + "epoch": 9.87, + "learning_rate": 6.339814032121725e-07, + "loss": 0.5153, + "step": 11680, + "task_loss": 0.4721524119377136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2525715231895447, + "epoch": 9.87, + "learning_rate": 6.297548605240913e-07, + "loss": 0.3088, + "step": 11681, + "task_loss": 0.16983474791049957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37924784421920776, + "epoch": 9.87, + "learning_rate": 6.255283178360102e-07, + "loss": 0.3107, + "step": 11682, + "task_loss": 0.2876981496810913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1962471902370453, + "epoch": 9.88, + "learning_rate": 6.21301775147929e-07, + "loss": 0.2913, + "step": 11683, + "task_loss": 0.7153844237327576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37121933698654175, + "epoch": 9.88, + "learning_rate": 6.170752324598479e-07, + "loss": 0.349, + "step": 11684, + "task_loss": 0.7558860778808594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26068949699401855, + "epoch": 9.88, + "learning_rate": 6.128486897717667e-07, + "loss": 0.3951, + "step": 11685, + "task_loss": 0.35780027508735657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29396599531173706, + "epoch": 9.88, + "learning_rate": 6.086221470836857e-07, + "loss": 0.3015, + "step": 11686, + "task_loss": 0.35192227363586426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23912331461906433, + "epoch": 9.88, + "learning_rate": 6.043956043956044e-07, + "loss": 0.3171, + "step": 11687, + "task_loss": 0.19755369424819946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3885962963104248, + "epoch": 9.88, + "learning_rate": 6.001690617075233e-07, + "loss": 0.4163, + "step": 11688, + "task_loss": 1.0168933868408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23741543292999268, + "epoch": 9.88, + "learning_rate": 5.959425190194421e-07, + "loss": 0.3953, + "step": 11689, + "task_loss": 0.17260943353176117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28203392028808594, + "epoch": 9.88, + "learning_rate": 5.91715976331361e-07, + "loss": 0.3141, + "step": 11690, + "task_loss": 0.1484801173210144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23727358877658844, + "epoch": 9.88, + "learning_rate": 5.874894336432798e-07, + "loss": 0.278, + "step": 11691, + "task_loss": 0.1387133151292801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19000782072544098, + "epoch": 9.88, + "learning_rate": 5.832628909551987e-07, + "loss": 0.3064, + "step": 11692, + "task_loss": 0.5097707509994507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39126887917518616, + "epoch": 9.88, + "learning_rate": 5.790363482671176e-07, + "loss": 0.2995, + "step": 11693, + "task_loss": 0.4464239478111267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2658774256706238, + "epoch": 9.88, + "learning_rate": 5.748098055790364e-07, + "loss": 0.343, + "step": 11694, + "task_loss": 0.34079092741012573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2129579782485962, + "epoch": 9.89, + "learning_rate": 5.705832628909553e-07, + "loss": 0.3307, + "step": 11695, + "task_loss": 0.29643699526786804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3026198148727417, + "epoch": 9.89, + "learning_rate": 5.66356720202874e-07, + "loss": 0.3112, + "step": 11696, + "task_loss": 0.20042301714420319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22726619243621826, + "epoch": 9.89, + "learning_rate": 5.621301775147929e-07, + "loss": 0.3357, + "step": 11697, + "task_loss": 0.12034071981906891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25201886892318726, + "epoch": 9.89, + "learning_rate": 5.579036348267117e-07, + "loss": 0.3915, + "step": 11698, + "task_loss": 0.8884305357933044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3317891061306, + "epoch": 9.89, + "learning_rate": 5.536770921386306e-07, + "loss": 0.3472, + "step": 11699, + "task_loss": 0.19472239911556244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.13104330003261566, + "epoch": 9.89, + "learning_rate": 5.494505494505495e-07, + "loss": 0.2407, + "step": 11700, + "task_loss": 0.30573517084121704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4251641631126404, + "epoch": 9.89, + "learning_rate": 5.452240067624684e-07, + "loss": 0.3792, + "step": 11701, + "task_loss": 2.264606237411499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2899189889431, + "epoch": 9.89, + "learning_rate": 5.409974640743872e-07, + "loss": 0.3221, + "step": 11702, + "task_loss": 0.7569632530212402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32380542159080505, + "epoch": 9.89, + "learning_rate": 5.367709213863061e-07, + "loss": 0.3383, + "step": 11703, + "task_loss": 0.39251238107681274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3275623917579651, + "epoch": 9.89, + "learning_rate": 5.325443786982249e-07, + "loss": 0.392, + "step": 11704, + "task_loss": 0.8659747242927551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3574153482913971, + "epoch": 9.89, + "learning_rate": 5.283178360101437e-07, + "loss": 0.4121, + "step": 11705, + "task_loss": 1.2058131694793701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26528435945510864, + "epoch": 9.89, + "learning_rate": 5.240912933220625e-07, + "loss": 0.3632, + "step": 11706, + "task_loss": 0.23518306016921997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3148086667060852, + "epoch": 9.9, + "learning_rate": 5.198647506339814e-07, + "loss": 0.3218, + "step": 11707, + "task_loss": 0.9215880036354065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2723386287689209, + "epoch": 9.9, + "learning_rate": 5.156382079459003e-07, + "loss": 0.318, + "step": 11708, + "task_loss": 1.4038259983062744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4142744243144989, + "epoch": 9.9, + "learning_rate": 5.114116652578192e-07, + "loss": 0.3177, + "step": 11709, + "task_loss": 0.6680419445037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4938109517097473, + "epoch": 9.9, + "learning_rate": 5.07185122569738e-07, + "loss": 0.375, + "step": 11710, + "task_loss": 1.2160124778747559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4601837992668152, + "epoch": 9.9, + "learning_rate": 5.029585798816569e-07, + "loss": 0.3874, + "step": 11711, + "task_loss": 0.6291356086730957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1953631043434143, + "epoch": 9.9, + "learning_rate": 4.987320371935757e-07, + "loss": 0.2847, + "step": 11712, + "task_loss": 0.17378218472003937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3112950325012207, + "epoch": 9.9, + "learning_rate": 4.945054945054946e-07, + "loss": 0.4124, + "step": 11713, + "task_loss": 0.7834708094596863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5255270600318909, + "epoch": 9.9, + "learning_rate": 4.902789518174133e-07, + "loss": 0.3007, + "step": 11714, + "task_loss": 0.9972193241119385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4606904685497284, + "epoch": 9.9, + "learning_rate": 4.860524091293322e-07, + "loss": 0.3886, + "step": 11715, + "task_loss": 0.7609273195266724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26381057500839233, + "epoch": 9.9, + "learning_rate": 4.818258664412511e-07, + "loss": 0.3228, + "step": 11716, + "task_loss": 0.6803664565086365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42576780915260315, + "epoch": 9.9, + "learning_rate": 4.775993237531699e-07, + "loss": 0.5524, + "step": 11717, + "task_loss": 0.9163270592689514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.538731038570404, + "epoch": 9.9, + "learning_rate": 4.733727810650888e-07, + "loss": 0.3756, + "step": 11718, + "task_loss": 0.8445892930030823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5840818285942078, + "epoch": 9.91, + "learning_rate": 4.6914623837700764e-07, + "loss": 0.3939, + "step": 11719, + "task_loss": 0.45174503326416016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4116341173648834, + "epoch": 9.91, + "learning_rate": 4.649196956889265e-07, + "loss": 0.3107, + "step": 11720, + "task_loss": 0.8853473663330078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25085964798927307, + "epoch": 9.91, + "learning_rate": 4.606931530008453e-07, + "loss": 0.3378, + "step": 11721, + "task_loss": 1.6889536380767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2072075754404068, + "epoch": 9.91, + "learning_rate": 4.5646661031276416e-07, + "loss": 0.3158, + "step": 11722, + "task_loss": 0.4609294533729553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1892644613981247, + "epoch": 9.91, + "learning_rate": 4.5224006762468305e-07, + "loss": 0.3881, + "step": 11723, + "task_loss": 0.45453980565071106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.681640088558197, + "epoch": 9.91, + "learning_rate": 4.480135249366019e-07, + "loss": 0.4691, + "step": 11724, + "task_loss": 1.1834304332733154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3957146406173706, + "epoch": 9.91, + "learning_rate": 4.4378698224852073e-07, + "loss": 0.3689, + "step": 11725, + "task_loss": 0.0429813526570797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4019812345504761, + "epoch": 9.91, + "learning_rate": 4.3956043956043957e-07, + "loss": 0.3422, + "step": 11726, + "task_loss": 0.31533750891685486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24700915813446045, + "epoch": 9.91, + "learning_rate": 4.353338968723584e-07, + "loss": 0.313, + "step": 11727, + "task_loss": 0.2257741540670395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18115797638893127, + "epoch": 9.91, + "learning_rate": 4.3110735418427724e-07, + "loss": 0.2315, + "step": 11728, + "task_loss": 0.8204843401908875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3415915071964264, + "epoch": 9.91, + "learning_rate": 4.268808114961961e-07, + "loss": 0.4687, + "step": 11729, + "task_loss": 0.8845313787460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.27753379940986633, + "epoch": 9.91, + "learning_rate": 4.22654268808115e-07, + "loss": 0.3118, + "step": 11730, + "task_loss": 0.5955291390419006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.260311484336853, + "epoch": 9.92, + "learning_rate": 4.1842772612003386e-07, + "loss": 0.3838, + "step": 11731, + "task_loss": 0.7572807669639587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5177025198936462, + "epoch": 9.92, + "learning_rate": 4.142011834319527e-07, + "loss": 0.3524, + "step": 11732, + "task_loss": 1.8879550695419312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4819498062133789, + "epoch": 9.92, + "learning_rate": 4.0997464074387154e-07, + "loss": 0.3571, + "step": 11733, + "task_loss": 0.8797711133956909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39422139525413513, + "epoch": 9.92, + "learning_rate": 4.057480980557904e-07, + "loss": 0.3882, + "step": 11734, + "task_loss": 0.594251275062561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4268626570701599, + "epoch": 9.92, + "learning_rate": 4.015215553677092e-07, + "loss": 0.3849, + "step": 11735, + "task_loss": 0.7167497873306274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.26411309838294983, + "epoch": 9.92, + "learning_rate": 3.9729501267962806e-07, + "loss": 0.3858, + "step": 11736, + "task_loss": 0.8943134546279907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4469754993915558, + "epoch": 9.92, + "learning_rate": 3.930684699915469e-07, + "loss": 0.3523, + "step": 11737, + "task_loss": 0.7392475008964539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38543856143951416, + "epoch": 9.92, + "learning_rate": 3.888419273034658e-07, + "loss": 0.3296, + "step": 11738, + "task_loss": 0.5616697072982788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25632214546203613, + "epoch": 9.92, + "learning_rate": 3.846153846153847e-07, + "loss": 0.4191, + "step": 11739, + "task_loss": 0.6170924305915833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35441380739212036, + "epoch": 9.92, + "learning_rate": 3.803888419273035e-07, + "loss": 0.4002, + "step": 11740, + "task_loss": 0.7470883727073669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33617663383483887, + "epoch": 9.92, + "learning_rate": 3.7616229923922236e-07, + "loss": 0.4149, + "step": 11741, + "task_loss": 0.5135720372200012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4026612937450409, + "epoch": 9.93, + "learning_rate": 3.719357565511412e-07, + "loss": 0.3361, + "step": 11742, + "task_loss": 0.6488247513771057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5324593782424927, + "epoch": 9.93, + "learning_rate": 3.6770921386306003e-07, + "loss": 0.3984, + "step": 11743, + "task_loss": 0.9223433136940002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5193485021591187, + "epoch": 9.93, + "learning_rate": 3.6348267117497887e-07, + "loss": 0.393, + "step": 11744, + "task_loss": 0.49762797355651855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5189992189407349, + "epoch": 9.93, + "learning_rate": 3.592561284868977e-07, + "loss": 0.4286, + "step": 11745, + "task_loss": 0.8544949889183044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47492849826812744, + "epoch": 9.93, + "learning_rate": 3.5502958579881655e-07, + "loss": 0.4421, + "step": 11746, + "task_loss": 1.631609320640564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5366489291191101, + "epoch": 9.93, + "learning_rate": 3.5080304311073544e-07, + "loss": 0.3366, + "step": 11747, + "task_loss": 0.6337089538574219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3371276259422302, + "epoch": 9.93, + "learning_rate": 3.465765004226543e-07, + "loss": 0.2871, + "step": 11748, + "task_loss": 0.32683488726615906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47234320640563965, + "epoch": 9.93, + "learning_rate": 3.423499577345731e-07, + "loss": 0.3862, + "step": 11749, + "task_loss": 1.876320242881775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32776206731796265, + "epoch": 9.93, + "learning_rate": 3.3812341504649196e-07, + "loss": 0.5036, + "step": 11750, + "task_loss": 1.546988606452942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3000595271587372, + "epoch": 9.93, + "learning_rate": 3.3389687235841085e-07, + "loss": 0.2902, + "step": 11751, + "task_loss": 0.8016992211341858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3162441849708557, + "epoch": 9.93, + "learning_rate": 3.296703296703297e-07, + "loss": 0.3835, + "step": 11752, + "task_loss": 0.3264237940311432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19355684518814087, + "epoch": 9.93, + "learning_rate": 3.254437869822485e-07, + "loss": 0.2912, + "step": 11753, + "task_loss": 0.45622894167900085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22515863180160522, + "epoch": 9.94, + "learning_rate": 3.212172442941674e-07, + "loss": 0.3743, + "step": 11754, + "task_loss": 0.43019795417785645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.39934051036834717, + "epoch": 9.94, + "learning_rate": 3.1699070160608626e-07, + "loss": 0.3113, + "step": 11755, + "task_loss": 0.30255988240242004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.42757365107536316, + "epoch": 9.94, + "learning_rate": 3.127641589180051e-07, + "loss": 0.5953, + "step": 11756, + "task_loss": 0.791983962059021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1849563866853714, + "epoch": 9.94, + "learning_rate": 3.0853761622992393e-07, + "loss": 0.3166, + "step": 11757, + "task_loss": 0.12404951453208923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3444669544696808, + "epoch": 9.94, + "learning_rate": 3.043110735418428e-07, + "loss": 0.3114, + "step": 11758, + "task_loss": 0.12442126870155334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2463337630033493, + "epoch": 9.94, + "learning_rate": 3.0008453085376166e-07, + "loss": 0.2859, + "step": 11759, + "task_loss": 0.6186110973358154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3415706157684326, + "epoch": 9.94, + "learning_rate": 2.958579881656805e-07, + "loss": 0.3007, + "step": 11760, + "task_loss": 0.5110601186752319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.37953025102615356, + "epoch": 9.94, + "learning_rate": 2.9163144547759934e-07, + "loss": 0.3598, + "step": 11761, + "task_loss": 0.5044890642166138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23354989290237427, + "epoch": 9.94, + "learning_rate": 2.874049027895182e-07, + "loss": 0.4289, + "step": 11762, + "task_loss": 0.28337883949279785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22724463045597076, + "epoch": 9.94, + "learning_rate": 2.83178360101437e-07, + "loss": 0.3825, + "step": 11763, + "task_loss": 0.4275471270084381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18522074818611145, + "epoch": 9.94, + "learning_rate": 2.7895181741335586e-07, + "loss": 0.3035, + "step": 11764, + "task_loss": 0.15577220916748047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2865057587623596, + "epoch": 9.94, + "learning_rate": 2.7472527472527475e-07, + "loss": 0.3349, + "step": 11765, + "task_loss": 0.5225082039833069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.38515332341194153, + "epoch": 9.95, + "learning_rate": 2.704987320371936e-07, + "loss": 0.3316, + "step": 11766, + "task_loss": 0.30551522970199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4762231707572937, + "epoch": 9.95, + "learning_rate": 2.662721893491124e-07, + "loss": 0.3445, + "step": 11767, + "task_loss": 0.38365766406059265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25808727741241455, + "epoch": 9.95, + "learning_rate": 2.6204564666103126e-07, + "loss": 0.351, + "step": 11768, + "task_loss": 0.6372522711753845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.5280554294586182, + "epoch": 9.95, + "learning_rate": 2.5781910397295016e-07, + "loss": 0.383, + "step": 11769, + "task_loss": 0.7633196711540222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6621668338775635, + "epoch": 9.95, + "learning_rate": 2.53592561284869e-07, + "loss": 0.4276, + "step": 11770, + "task_loss": 0.4530010223388672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6073736548423767, + "epoch": 9.95, + "learning_rate": 2.4936601859678783e-07, + "loss": 0.3712, + "step": 11771, + "task_loss": 0.6752600073814392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28225088119506836, + "epoch": 9.95, + "learning_rate": 2.4513947590870667e-07, + "loss": 0.3297, + "step": 11772, + "task_loss": 1.1286174058914185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3454148769378662, + "epoch": 9.95, + "learning_rate": 2.4091293322062556e-07, + "loss": 0.3399, + "step": 11773, + "task_loss": 0.12636178731918335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3395894169807434, + "epoch": 9.95, + "learning_rate": 2.366863905325444e-07, + "loss": 0.3018, + "step": 11774, + "task_loss": 0.11742892116308212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2392624169588089, + "epoch": 9.95, + "learning_rate": 2.3245984784446324e-07, + "loss": 0.4079, + "step": 11775, + "task_loss": 0.31141945719718933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3980900049209595, + "epoch": 9.95, + "learning_rate": 2.2823330515638208e-07, + "loss": 0.366, + "step": 11776, + "task_loss": 0.8269650340080261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3302658200263977, + "epoch": 9.95, + "learning_rate": 2.2400676246830094e-07, + "loss": 0.3754, + "step": 11777, + "task_loss": 0.7397379875183105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3014085590839386, + "epoch": 9.96, + "learning_rate": 2.1978021978021978e-07, + "loss": 0.3202, + "step": 11778, + "task_loss": 0.09280461072921753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.49418768286705017, + "epoch": 9.96, + "learning_rate": 2.1555367709213862e-07, + "loss": 0.3977, + "step": 11779, + "task_loss": 0.751481831073761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.47034943103790283, + "epoch": 9.96, + "learning_rate": 2.113271344040575e-07, + "loss": 0.4858, + "step": 11780, + "task_loss": 0.967939019203186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2273130714893341, + "epoch": 9.96, + "learning_rate": 2.0710059171597635e-07, + "loss": 0.2863, + "step": 11781, + "task_loss": 0.4272995591163635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34906578063964844, + "epoch": 9.96, + "learning_rate": 2.028740490278952e-07, + "loss": 0.3483, + "step": 11782, + "task_loss": 0.7774043679237366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2934296727180481, + "epoch": 9.96, + "learning_rate": 1.9864750633981403e-07, + "loss": 0.3631, + "step": 11783, + "task_loss": 0.3624795973300934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.282190203666687, + "epoch": 9.96, + "learning_rate": 1.944209636517329e-07, + "loss": 0.3786, + "step": 11784, + "task_loss": 0.32133761048316956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.34569019079208374, + "epoch": 9.96, + "learning_rate": 1.9019442096365176e-07, + "loss": 0.4441, + "step": 11785, + "task_loss": 0.4626517593860626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32801681756973267, + "epoch": 9.96, + "learning_rate": 1.859678782755706e-07, + "loss": 0.2539, + "step": 11786, + "task_loss": 0.06652214378118515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.24997010827064514, + "epoch": 9.96, + "learning_rate": 1.8174133558748944e-07, + "loss": 0.3825, + "step": 11787, + "task_loss": 0.08898423612117767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.20445089042186737, + "epoch": 9.96, + "learning_rate": 1.7751479289940827e-07, + "loss": 0.3181, + "step": 11788, + "task_loss": 0.5703615546226501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.40065449476242065, + "epoch": 9.96, + "learning_rate": 1.7328825021132714e-07, + "loss": 0.3168, + "step": 11789, + "task_loss": 0.40044814348220825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.322534441947937, + "epoch": 9.97, + "learning_rate": 1.6906170752324598e-07, + "loss": 0.3783, + "step": 11790, + "task_loss": 0.30410873889923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.17657209932804108, + "epoch": 9.97, + "learning_rate": 1.6483516483516484e-07, + "loss": 0.2241, + "step": 11791, + "task_loss": 0.3425569534301758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.25555211305618286, + "epoch": 9.97, + "learning_rate": 1.606086221470837e-07, + "loss": 0.356, + "step": 11792, + "task_loss": 0.8189453482627869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.35348495841026306, + "epoch": 9.97, + "learning_rate": 1.5638207945900255e-07, + "loss": 0.3425, + "step": 11793, + "task_loss": 0.4430040419101715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3643072843551636, + "epoch": 9.97, + "learning_rate": 1.521555367709214e-07, + "loss": 0.2781, + "step": 11794, + "task_loss": 0.6984408497810364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.33921003341674805, + "epoch": 9.97, + "learning_rate": 1.4792899408284025e-07, + "loss": 0.4425, + "step": 11795, + "task_loss": 0.9837867617607117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2672138214111328, + "epoch": 9.97, + "learning_rate": 1.437024513947591e-07, + "loss": 0.3386, + "step": 11796, + "task_loss": 0.6427420973777771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3433264493942261, + "epoch": 9.97, + "learning_rate": 1.3947590870667793e-07, + "loss": 0.3824, + "step": 11797, + "task_loss": 0.26729127764701843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2660945951938629, + "epoch": 9.97, + "learning_rate": 1.352493660185968e-07, + "loss": 0.3078, + "step": 11798, + "task_loss": 0.5148373246192932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.19694188237190247, + "epoch": 9.97, + "learning_rate": 1.3102282333051563e-07, + "loss": 0.3864, + "step": 11799, + "task_loss": 0.6460482478141785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.18113857507705688, + "epoch": 9.97, + "learning_rate": 1.267962806424345e-07, + "loss": 0.2952, + "step": 11800, + "task_loss": 0.2964319884777069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2834235429763794, + "epoch": 9.97, + "learning_rate": 1.2256973795435334e-07, + "loss": 0.373, + "step": 11801, + "task_loss": 0.7258270382881165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43097978830337524, + "epoch": 9.98, + "learning_rate": 1.183431952662722e-07, + "loss": 0.2945, + "step": 11802, + "task_loss": 0.7247740626335144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23921102285385132, + "epoch": 9.98, + "learning_rate": 1.1411665257819104e-07, + "loss": 0.3422, + "step": 11803, + "task_loss": 0.47640129923820496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3541773855686188, + "epoch": 9.98, + "learning_rate": 1.0989010989010989e-07, + "loss": 0.3429, + "step": 11804, + "task_loss": 0.5667593479156494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2955644130706787, + "epoch": 9.98, + "learning_rate": 1.0566356720202876e-07, + "loss": 0.3195, + "step": 11805, + "task_loss": 0.11272616684436798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4857930839061737, + "epoch": 9.98, + "learning_rate": 1.014370245139476e-07, + "loss": 0.3202, + "step": 11806, + "task_loss": 0.30028092861175537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3765583634376526, + "epoch": 9.98, + "learning_rate": 9.721048182586645e-08, + "loss": 0.3557, + "step": 11807, + "task_loss": 0.46042531728744507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2766381502151489, + "epoch": 9.98, + "learning_rate": 9.29839391377853e-08, + "loss": 0.3689, + "step": 11808, + "task_loss": 0.3302498757839203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.43337833881378174, + "epoch": 9.98, + "learning_rate": 8.875739644970414e-08, + "loss": 0.3686, + "step": 11809, + "task_loss": 0.6017572283744812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3352062702178955, + "epoch": 9.98, + "learning_rate": 8.453085376162299e-08, + "loss": 0.5057, + "step": 11810, + "task_loss": 0.4864320158958435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.346771776676178, + "epoch": 9.98, + "learning_rate": 8.030431107354185e-08, + "loss": 0.3923, + "step": 11811, + "task_loss": 1.0557861328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2765885293483734, + "epoch": 9.98, + "learning_rate": 7.60777683854607e-08, + "loss": 0.51, + "step": 11812, + "task_loss": 0.6310605406761169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.28861021995544434, + "epoch": 9.99, + "learning_rate": 7.185122569737954e-08, + "loss": 0.3857, + "step": 11813, + "task_loss": 0.6160259246826172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.30933618545532227, + "epoch": 9.99, + "learning_rate": 6.76246830092984e-08, + "loss": 0.2909, + "step": 11814, + "task_loss": 0.21384865045547485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36904263496398926, + "epoch": 9.99, + "learning_rate": 6.339814032121725e-08, + "loss": 0.4287, + "step": 11815, + "task_loss": 0.48181653022766113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.6379642486572266, + "epoch": 9.99, + "learning_rate": 5.91715976331361e-08, + "loss": 0.442, + "step": 11816, + "task_loss": 1.2292546033859253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.416140615940094, + "epoch": 9.99, + "learning_rate": 5.4945054945054946e-08, + "loss": 0.4102, + "step": 11817, + "task_loss": 0.4823341369628906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3310478925704956, + "epoch": 9.99, + "learning_rate": 5.07185122569738e-08, + "loss": 0.3042, + "step": 11818, + "task_loss": 0.41227760910987854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.22006428241729736, + "epoch": 9.99, + "learning_rate": 4.649196956889265e-08, + "loss": 0.3315, + "step": 11819, + "task_loss": 0.05721826106309891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.1765422523021698, + "epoch": 9.99, + "learning_rate": 4.2265426880811495e-08, + "loss": 0.3644, + "step": 11820, + "task_loss": 1.6966221332550049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.36882930994033813, + "epoch": 9.99, + "learning_rate": 3.803888419273035e-08, + "loss": 0.3452, + "step": 11821, + "task_loss": 0.7885497212409973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.4522852897644043, + "epoch": 9.99, + "learning_rate": 3.38123415046492e-08, + "loss": 0.3788, + "step": 11822, + "task_loss": 0.9584880471229553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.29514628648757935, + "epoch": 9.99, + "learning_rate": 2.958579881656805e-08, + "loss": 0.3308, + "step": 11823, + "task_loss": 0.020527873188257217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3450312614440918, + "epoch": 9.99, + "learning_rate": 2.53592561284869e-08, + "loss": 0.3556, + "step": 11824, + "task_loss": 0.8504543304443359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2872273027896881, + "epoch": 10.0, + "learning_rate": 2.1132713440405747e-08, + "loss": 0.3119, + "step": 11825, + "task_loss": 0.4323715269565582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2442912757396698, + "epoch": 10.0, + "learning_rate": 1.69061707523246e-08, + "loss": 0.3689, + "step": 11826, + "task_loss": 0.8550302982330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.32440152764320374, + "epoch": 10.0, + "learning_rate": 1.267962806424345e-08, + "loss": 0.4003, + "step": 11827, + "task_loss": 0.4759567379951477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.3857249617576599, + "epoch": 10.0, + "learning_rate": 8.4530853761623e-09, + "loss": 0.405, + "step": 11828, + "task_loss": 0.8508530259132385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.2739834785461426, + "epoch": 10.0, + "learning_rate": 4.22654268808115e-09, + "loss": 0.285, + "step": 11829, + "task_loss": 0.9523541331291199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.38010914810693824, + "compression/movement_sparsity/model_sparsity": 0.3670512295169506, + "compression_loss": 0.0, + "distillation_loss": 0.23988965153694153, + "epoch": 10.0, + "learning_rate": 0.0, + "loss": 0.3541, + "step": 11830, + "task_loss": 1.0655288696289062 + }, + { + "epoch": 10.0, + "step": 11830, + "total_flos": 5.9664632082415714e+19, + "train_loss": 12.959198828177591, + "train_runtime": 51238.186, + "train_samples_per_second": 14.784, + "train_steps_per_second": 0.231 + } + ], + "max_steps": 11830, + "num_train_epochs": 10, + "total_flos": 5.9664632082415714e+19, + "trial_name": null, + "trial_params": null +}