{ "best_metric": 0.9794057075610474, "best_model_checkpoint": "/nvme2/yujiepan/workspace/jpqd-test/playground/optimum-playground/0314.example-rerun/logs/w2v2-ks-jpqd-quant-FE-finetuned-student/checkpoint-4788", "epoch": 11.999373825923607, "global_step": 4788, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 2.51446533203125, "epoch": 0.03, "learning_rate": 8.771929824561403e-08, "loss": 0.9515, "step": 10, "task_loss": 0.9478130340576172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 2.1920084953308105, "epoch": 0.05, "learning_rate": 3.2163742690058475e-07, "loss": 0.9499, "step": 20, "task_loss": 1.1256299018859863 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.5554466247558594, "epoch": 0.08, "learning_rate": 6.140350877192981e-07, "loss": 0.8458, "step": 30, "task_loss": 0.8432578444480896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.1300909519195557, "epoch": 0.1, "learning_rate": 8.771929824561403e-07, "loss": 0.7546, "step": 40, "task_loss": 0.6580129861831665 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.2838783264160156, "epoch": 0.13, "learning_rate": 1.1695906432748535e-06, "loss": 0.7662, "step": 50, "task_loss": 0.6450424194335938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.9236965775489807, "epoch": 0.15, "learning_rate": 1.4619883040935671e-06, "loss": 0.6895, "step": 60, "task_loss": 0.4147263765335083 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.3553576469421387, "epoch": 0.18, "learning_rate": 1.7543859649122805e-06, "loss": 0.5607, "step": 70, "task_loss": 0.5525619983673096 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.6165952682495117, "epoch": 0.2, "learning_rate": 2.046783625730994e-06, "loss": 0.6068, "step": 80, "task_loss": 0.8984407782554626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.6254358291625977, "epoch": 0.23, "learning_rate": 2.339181286549707e-06, "loss": 0.5856, "step": 90, "task_loss": 0.8998498916625977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.6450754404067993, "epoch": 0.25, "learning_rate": 2.6315789473684207e-06, "loss": 0.6321, "step": 100, "task_loss": 0.2010144591331482 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.7683451175689697, "epoch": 0.28, "learning_rate": 2.9239766081871343e-06, "loss": 0.5395, "step": 110, "task_loss": 0.3388332724571228 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.49454671144485474, "epoch": 0.3, "learning_rate": 3.2163742690058475e-06, "loss": 0.5699, "step": 120, "task_loss": 0.35037726163864136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.6786686182022095, "epoch": 0.33, "learning_rate": 3.508771929824561e-06, "loss": 0.559, "step": 130, "task_loss": 0.510002076625824 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.5671663284301758, "epoch": 0.35, "learning_rate": 3.8011695906432742e-06, "loss": 0.5609, "step": 140, "task_loss": 0.29252851009368896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.5783287882804871, "epoch": 0.38, "learning_rate": 4.093567251461988e-06, "loss": 0.4257, "step": 150, "task_loss": 0.32213056087493896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.8722546100616455, "epoch": 0.4, "learning_rate": 4.3859649122807014e-06, "loss": 0.4616, "step": 160, "task_loss": 0.5493186116218567 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.5747439861297607, "epoch": 0.43, "learning_rate": 4.678362573099414e-06, "loss": 0.459, "step": 170, "task_loss": 0.26550352573394775 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.16851046681404114, "epoch": 0.45, "learning_rate": 4.970760233918128e-06, "loss": 0.4353, "step": 180, "task_loss": 0.08923470973968506 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.2978078126907349, "epoch": 0.48, "learning_rate": 5.263157894736841e-06, "loss": 0.4977, "step": 190, "task_loss": 0.817858099937439 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.8489176034927368, "epoch": 0.5, "learning_rate": 5.555555555555555e-06, "loss": 0.5026, "step": 200, "task_loss": 0.4384632706642151 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.7206552028656006, "epoch": 0.53, "learning_rate": 5.8479532163742686e-06, "loss": 0.4572, "step": 210, "task_loss": 0.3212318420410156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.8086921572685242, "epoch": 0.55, "learning_rate": 6.140350877192981e-06, "loss": 0.4925, "step": 220, "task_loss": 0.447459876537323 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.8657560348510742, "epoch": 0.58, "learning_rate": 6.432748538011695e-06, "loss": 0.4788, "step": 230, "task_loss": 0.5898361206054688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.5544594526290894, "epoch": 0.6, "learning_rate": 6.7251461988304085e-06, "loss": 0.442, "step": 240, "task_loss": 0.34924864768981934 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.7391929626464844, "epoch": 0.63, "learning_rate": 7.017543859649122e-06, "loss": 0.4101, "step": 250, "task_loss": 0.40177232027053833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.812840461730957, "epoch": 0.65, "learning_rate": 7.309941520467835e-06, "loss": 0.4813, "step": 260, "task_loss": 0.5525591373443604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.3050377368927002, "epoch": 0.68, "learning_rate": 7.6023391812865485e-06, "loss": 0.4787, "step": 270, "task_loss": 0.9039720892906189 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.47748178243637085, "epoch": 0.7, "learning_rate": 7.894736842105261e-06, "loss": 0.412, "step": 280, "task_loss": 0.18844187259674072 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.7705972194671631, "epoch": 0.73, "learning_rate": 8.187134502923976e-06, "loss": 0.4226, "step": 290, "task_loss": 0.4612080454826355 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.4262794852256775, "epoch": 0.75, "learning_rate": 8.479532163742688e-06, "loss": 0.4198, "step": 300, "task_loss": 0.3151981234550476 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.5648545026779175, "epoch": 0.78, "learning_rate": 8.771929824561403e-06, "loss": 0.3941, "step": 310, "task_loss": 0.4206511378288269 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.44185498356819153, "epoch": 0.8, "learning_rate": 9.064327485380116e-06, "loss": 0.4256, "step": 320, "task_loss": 0.24285316467285156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.5537223815917969, "epoch": 0.83, "learning_rate": 9.356725146198828e-06, "loss": 0.4058, "step": 330, "task_loss": 0.21424394845962524 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.45310112833976746, "epoch": 0.85, "learning_rate": 9.649122807017543e-06, "loss": 0.3847, "step": 340, "task_loss": 0.16723406314849854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.8928914070129395, "epoch": 0.88, "learning_rate": 9.941520467836256e-06, "loss": 0.353, "step": 350, "task_loss": 0.5004438161849976 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 1.086942434310913, "epoch": 0.9, "learning_rate": 1.023391812865497e-05, "loss": 0.3981, "step": 360, "task_loss": 0.4962713122367859 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.283523827791214, "epoch": 0.93, "learning_rate": 1.0526315789473683e-05, "loss": 0.3131, "step": 370, "task_loss": 0.3023862838745117 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.72919762134552, "epoch": 0.95, "learning_rate": 1.0818713450292396e-05, "loss": 0.3519, "step": 380, "task_loss": 0.3908747434616089 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 0.3192288875579834, "epoch": 0.98, "learning_rate": 1.111111111111111e-05, "loss": 0.3477, "step": 390, "task_loss": 0.28754663467407227 }, { "epoch": 1.0, "eval_accuracy": 0.9636657840541336, "eval_loss": 0.15156690776348114, "eval_runtime": 116.1773, "eval_samples_per_second": 58.514, "eval_steps_per_second": 1.833, "step": 399 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -0.0003772132040467113, "compression/movement_sparsity/linear_layer_sparsity": 0.0010102966915085817, "compression/movement_sparsity/model_sparsity": 0.0009082319164113224, "compression_loss": 0.0, "distillation_loss": 0.35058850049972534, "epoch": 1.0, "learning_rate": 1.1403508771929823e-05, "loss": 0.3485, "step": 400, "task_loss": 0.1688241958618164 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0005970049999999993, "compression/movement_sparsity/importance_threshold": -0.00037158324982466365, "compression/movement_sparsity/linear_layer_sparsity": 0.0012727599932249322, "compression/movement_sparsity/model_sparsity": 0.0011441799794991438, "compression_loss": 0.16240963339805603, "distillation_loss": 0.4725266396999359, "epoch": 1.03, "learning_rate": 1.1695906432748537e-05, "loss": 0.433, "step": 410, "task_loss": 0.288402259349823 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0011880399999999991, "compression/movement_sparsity/importance_threshold": -0.00036600959467331996, "compression/movement_sparsity/linear_layer_sparsity": 0.0019467616117133392, "compression/movement_sparsity/model_sparsity": 0.001750090883463397, "compression_loss": 0.32319432497024536, "distillation_loss": 0.3679881989955902, "epoch": 1.05, "learning_rate": 1.198830409356725e-05, "loss": 0.5607, "step": 420, "task_loss": 0.1445859670639038 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0017731350000000036, "compression/movement_sparsity/importance_threshold": -0.00036049195568277717, "compression/movement_sparsity/linear_layer_sparsity": 0.0026285851400180667, "compression/movement_sparsity/model_sparsity": 0.0023630334922745354, "compression_loss": 0.4823610484600067, "distillation_loss": 0.6154017448425293, "epoch": 1.08, "learning_rate": 1.2280701754385963e-05, "loss": 0.7495, "step": 430, "task_loss": 0.3742516040802002 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0023523200000000032, "compression/movement_sparsity/importance_threshold": -0.0003550300499431323, "compression/movement_sparsity/linear_layer_sparsity": 0.0037378260501355013, "compression/movement_sparsity/model_sparsity": 0.003360213831500896, "compression_loss": 0.6399164199829102, "distillation_loss": 0.7374873757362366, "epoch": 1.1, "learning_rate": 1.2573099415204677e-05, "loss": 0.9126, "step": 440, "task_loss": 0.37421369552612305 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002925625000000003, "compression/movement_sparsity/importance_threshold": -0.0003496235945444823, "compression/movement_sparsity/linear_layer_sparsity": 0.00737426132941885, "compression/movement_sparsity/model_sparsity": 0.0066292798497985545, "compression_loss": 0.7958664298057556, "distillation_loss": 0.6916153430938721, "epoch": 1.13, "learning_rate": 1.286549707602339e-05, "loss": 1.1032, "step": 450, "task_loss": 0.35443025827407837 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0034930800000000017, "compression/movement_sparsity/importance_threshold": -0.0003442723065769242, "compression/movement_sparsity/linear_layer_sparsity": 0.014844843891147244, "compression/movement_sparsity/model_sparsity": 0.013345150122140185, "compression_loss": 0.9502197504043579, "distillation_loss": 0.25516602396965027, "epoch": 1.15, "learning_rate": 1.3157894736842103e-05, "loss": 1.2696, "step": 460, "task_loss": 0.12397211790084839 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004054715000000005, "compression/movement_sparsity/importance_threshold": -0.00033897590313055476, "compression/movement_sparsity/linear_layer_sparsity": 0.025417325353809094, "compression/movement_sparsity/model_sparsity": 0.0228495513349348, "compression_loss": 1.102981686592102, "distillation_loss": 0.7275341749191284, "epoch": 1.18, "learning_rate": 1.3450292397660817e-05, "loss": 1.4399, "step": 470, "task_loss": 0.28879886865615845 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004610560000000006, "compression/movement_sparsity/importance_threshold": -0.0003337341012954711, "compression/movement_sparsity/linear_layer_sparsity": 0.03841998597937368, "compression/movement_sparsity/model_sparsity": 0.03453862393871483, "compression_loss": 1.2541608810424805, "distillation_loss": 0.3953809142112732, "epoch": 1.2, "learning_rate": 1.374269005847953e-05, "loss": 1.5474, "step": 480, "task_loss": 0.30661720037460327 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005160645000000006, "compression/movement_sparsity/importance_threshold": -0.00032854661816177023, "compression/movement_sparsity/linear_layer_sparsity": 0.05003314607798856, "compression/movement_sparsity/model_sparsity": 0.04497856968990499, "compression_loss": 1.403770089149475, "distillation_loss": 0.14313216507434845, "epoch": 1.23, "learning_rate": 1.4035087719298244e-05, "loss": 1.7254, "step": 490, "task_loss": 0.15733814239501953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005705000000000004, "compression/movement_sparsity/importance_threshold": -0.0003234131708195491, "compression/movement_sparsity/linear_layer_sparsity": 0.060780132584311954, "compression/movement_sparsity/model_sparsity": 0.05463984665173478, "compression_loss": 1.5518090724945068, "distillation_loss": 0.3677009344100952, "epoch": 1.25, "learning_rate": 1.4327485380116957e-05, "loss": 1.8058, "step": 500, "task_loss": 0.3658151626586914 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006243655000000006, "compression/movement_sparsity/importance_threshold": -0.00031833347635890454, "compression/movement_sparsity/linear_layer_sparsity": 0.07141967075052695, "compression/movement_sparsity/model_sparsity": 0.06420453019435209, "compression_loss": 1.698294758796692, "distillation_loss": 0.1806531548500061, "epoch": 1.28, "learning_rate": 1.461988304093567e-05, "loss": 2.0346, "step": 510, "task_loss": 0.17051905393600464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006776640000000005, "compression/movement_sparsity/importance_threshold": -0.00031330725186993363, "compression/movement_sparsity/linear_layer_sparsity": 0.08092245605615779, "compression/movement_sparsity/model_sparsity": 0.07274730082986806, "compression_loss": 1.8432351350784302, "distillation_loss": 0.8070929050445557, "epoch": 1.3, "learning_rate": 1.4912280701754384e-05, "loss": 2.1974, "step": 520, "task_loss": 0.42286455631256104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007303984999999993, "compression/movement_sparsity/importance_threshold": -0.00030833421444273343, "compression/movement_sparsity/linear_layer_sparsity": 0.09169531955736224, "compression/movement_sparsity/model_sparsity": 0.08243184057464996, "compression_loss": 1.9866251945495605, "distillation_loss": 0.5600777864456177, "epoch": 1.33, "learning_rate": 1.5204678362573097e-05, "loss": 2.2739, "step": 530, "task_loss": 0.1994045376777649 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007825719999999996, "compression/movement_sparsity/importance_threshold": -0.0003034140811674006, "compression/movement_sparsity/linear_layer_sparsity": 0.102509927356218, "compression/movement_sparsity/model_sparsity": 0.09215390741793049, "compression_loss": 2.1284844875335693, "distillation_loss": 0.6648948192596436, "epoch": 1.35, "learning_rate": 1.549707602339181e-05, "loss": 2.4111, "step": 540, "task_loss": 0.397721529006958 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008341874999999997, "compression/movement_sparsity/importance_threshold": -0.00029854656913403234, "compression/movement_sparsity/linear_layer_sparsity": 0.11246536011367059, "compression/movement_sparsity/model_sparsity": 0.10110359699723998, "compression_loss": 2.268819808959961, "distillation_loss": 0.5570085644721985, "epoch": 1.38, "learning_rate": 1.5789473684210522e-05, "loss": 2.5594, "step": 550, "task_loss": 0.24902677536010742 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008852479999999998, "compression/movement_sparsity/importance_threshold": -0.00029373139543272556, "compression/movement_sparsity/linear_layer_sparsity": 0.12301906899277326, "compression/movement_sparsity/model_sparsity": 0.11059112211840207, "compression_loss": 2.4076294898986816, "distillation_loss": 1.1394556760787964, "epoch": 1.4, "learning_rate": 1.608187134502924e-05, "loss": 2.661, "step": 560, "task_loss": 0.47712135314941406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009357564999999997, "compression/movement_sparsity/importance_threshold": -0.00028896827715357723, "compression/movement_sparsity/linear_layer_sparsity": 0.1357961175097862, "compression/movement_sparsity/model_sparsity": 0.12207737497681652, "compression_loss": 2.54492449760437, "distillation_loss": 0.7886945009231567, "epoch": 1.43, "learning_rate": 1.637426900584795e-05, "loss": 2.8728, "step": 570, "task_loss": 0.5190638899803162 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009857159999999997, "compression/movement_sparsity/importance_threshold": -0.00028425693138668434, "compression/movement_sparsity/linear_layer_sparsity": 0.1511111816847335, "compression/movement_sparsity/model_sparsity": 0.13584524158717315, "compression_loss": 2.6807024478912354, "distillation_loss": 0.5934693813323975, "epoch": 1.45, "learning_rate": 1.6666666666666664e-05, "loss": 3.0271, "step": 580, "task_loss": 0.43770378828048706 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010351294999999996, "compression/movement_sparsity/importance_threshold": -0.0002795970752221438, "compression/movement_sparsity/linear_layer_sparsity": 0.17006478658536586, "compression/movement_sparsity/model_sparsity": 0.15288406696044038, "compression_loss": 2.814985752105713, "distillation_loss": 0.4786364436149597, "epoch": 1.48, "learning_rate": 1.6959064327485377e-05, "loss": 3.2229, "step": 590, "task_loss": 0.19185221195220947 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010839999999999997, "compression/movement_sparsity/importance_threshold": -0.0002749884257500526, "compression/movement_sparsity/linear_layer_sparsity": 0.18809244203553147, "compression/movement_sparsity/model_sparsity": 0.16909048651572786, "compression_loss": 2.9477694034576416, "distillation_loss": 0.6082563400268555, "epoch": 1.5, "learning_rate": 1.725146198830409e-05, "loss": 3.4037, "step": 600, "task_loss": 0.20349204540252686 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011323304999999997, "compression/movement_sparsity/importance_threshold": -0.0002704307000605077, "compression/movement_sparsity/linear_layer_sparsity": 0.20836323302469137, "compression/movement_sparsity/model_sparsity": 0.18731342983722607, "compression_loss": 3.0790719985961914, "distillation_loss": 1.0272729396820068, "epoch": 1.53, "learning_rate": 1.7543859649122806e-05, "loss": 3.4945, "step": 610, "task_loss": 0.4551582932472229 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011801239999999998, "compression/movement_sparsity/importance_threshold": -0.00026592361524360606, "compression/movement_sparsity/linear_layer_sparsity": 0.22696513427807888, "compression/movement_sparsity/model_sparsity": 0.20403608226819747, "compression_loss": 3.2089145183563232, "distillation_loss": 1.0786527395248413, "epoch": 1.55, "learning_rate": 1.783625730994152e-05, "loss": 3.7627, "step": 620, "task_loss": 0.6676000356674194 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012273834999999997, "compression/movement_sparsity/importance_threshold": -0.00026146688838944466, "compression/movement_sparsity/linear_layer_sparsity": 0.24483187010689553, "compression/movement_sparsity/model_sparsity": 0.22009783903549965, "compression_loss": 3.337289333343506, "distillation_loss": 0.9446437358856201, "epoch": 1.58, "learning_rate": 1.812865497076023e-05, "loss": 3.8192, "step": 630, "task_loss": 0.5477078557014465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012741120000000002, "compression/movement_sparsity/importance_threshold": -0.00025706023658812044, "compression/movement_sparsity/linear_layer_sparsity": 0.26339174476814214, "compression/movement_sparsity/model_sparsity": 0.23678271059215836, "compression_loss": 3.4642162322998047, "distillation_loss": 0.8719321489334106, "epoch": 1.6, "learning_rate": 1.8421052631578944e-05, "loss": 3.9186, "step": 640, "task_loss": 0.6353597044944763 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013203125, "compression/movement_sparsity/importance_threshold": -0.00025270337692973044, "compression/movement_sparsity/linear_layer_sparsity": 0.2798236835666968, "compression/movement_sparsity/model_sparsity": 0.25155462006271223, "compression_loss": 3.58969783782959, "distillation_loss": 1.3224390745162964, "epoch": 1.63, "learning_rate": 1.8713450292397657e-05, "loss": 3.9562, "step": 650, "task_loss": 0.880706250667572 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013659880000000003, "compression/movement_sparsity/importance_threshold": -0.0002483960265043715, "compression/movement_sparsity/linear_layer_sparsity": 0.29560894450090336, "compression/movement_sparsity/model_sparsity": 0.26574518201330083, "compression_loss": 3.713754892349243, "distillation_loss": 0.628543496131897, "epoch": 1.65, "learning_rate": 1.9005847953216373e-05, "loss": 4.1618, "step": 660, "task_loss": 0.21685028076171875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014111415000000002, "compression/movement_sparsity/importance_threshold": -0.00024413790240214073, "compression/movement_sparsity/linear_layer_sparsity": 0.3098370102190605, "compression/movement_sparsity/model_sparsity": 0.27853586370377764, "compression_loss": 3.836373805999756, "distillation_loss": 0.18178164958953857, "epoch": 1.68, "learning_rate": 1.9298245614035086e-05, "loss": 4.2838, "step": 670, "task_loss": 0.11985671520233154 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014557760000000003, "compression/movement_sparsity/importance_threshold": -0.000239928721713135, "compression/movement_sparsity/linear_layer_sparsity": 0.32408217827838, "compression/movement_sparsity/model_sparsity": 0.29134191997898734, "compression_loss": 3.957580804824829, "distillation_loss": 0.7838168144226074, "epoch": 1.7, "learning_rate": 1.95906432748538e-05, "loss": 4.3676, "step": 680, "task_loss": 0.4651721715927124 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014998945000000003, "compression/movement_sparsity/importance_threshold": -0.00023576820152745127, "compression/movement_sparsity/linear_layer_sparsity": 0.33866125366982835, "compression/movement_sparsity/model_sparsity": 0.3044481507462172, "compression_loss": 4.077360153198242, "distillation_loss": 0.8586763143539429, "epoch": 1.73, "learning_rate": 1.988304093567251e-05, "loss": 4.5084, "step": 690, "task_loss": 0.6287266612052917 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015435000000000004, "compression/movement_sparsity/importance_threshold": -0.00023165605893518657, "compression/movement_sparsity/linear_layer_sparsity": 0.3526366422952424, "compression/movement_sparsity/model_sparsity": 0.317011681935159, "compression_loss": 4.195735931396484, "distillation_loss": 2.807767391204834, "epoch": 1.75, "learning_rate": 2.0175438596491224e-05, "loss": 4.605, "step": 700, "task_loss": 1.069559097290039 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015865955, "compression/movement_sparsity/importance_threshold": -0.00022759201102643783, "compression/movement_sparsity/linear_layer_sparsity": 0.36448323641222524, "compression/movement_sparsity/model_sparsity": 0.32766147913656163, "compression_loss": 4.312725067138672, "distillation_loss": 0.7329788208007812, "epoch": 1.78, "learning_rate": 2.046783625730994e-05, "loss": 4.7339, "step": 710, "task_loss": 0.3494850993156433 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016291840000000005, "compression/movement_sparsity/importance_threshold": -0.00022357577489130195, "compression/movement_sparsity/linear_layer_sparsity": 0.3769337760651912, "compression/movement_sparsity/model_sparsity": 0.3388542085440818, "compression_loss": 4.428336143493652, "distillation_loss": 0.37443456053733826, "epoch": 1.8, "learning_rate": 2.0760233918128653e-05, "loss": 4.8305, "step": 720, "task_loss": 0.24049115180969238 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016712685000000005, "compression/movement_sparsity/importance_threshold": -0.00021960706761987598, "compression/movement_sparsity/linear_layer_sparsity": 0.3912986134635652, "compression/movement_sparsity/model_sparsity": 0.3517678446164528, "compression_loss": 4.542544841766357, "distillation_loss": 0.6300625801086426, "epoch": 1.83, "learning_rate": 2.1052631578947366e-05, "loss": 4.9322, "step": 730, "task_loss": 0.3629845976829529 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017128519999999994, "compression/movement_sparsity/importance_threshold": -0.00021568560630225698, "compression/movement_sparsity/linear_layer_sparsity": 0.40521868413128576, "compression/movement_sparsity/model_sparsity": 0.36428164631983834, "compression_loss": 4.655385971069336, "distillation_loss": 0.6407253742218018, "epoch": 1.85, "learning_rate": 2.1345029239766078e-05, "loss": 5.0662, "step": 740, "task_loss": 0.2929871678352356 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017539375000000003, "compression/movement_sparsity/importance_threshold": -0.00021181110802854163, "compression/movement_sparsity/linear_layer_sparsity": 0.42049019261894005, "compression/movement_sparsity/model_sparsity": 0.37801035743688965, "compression_loss": 4.766801357269287, "distillation_loss": 0.8653441667556763, "epoch": 1.88, "learning_rate": 2.163742690058479e-05, "loss": 5.1373, "step": 750, "task_loss": 0.5622042417526245 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017945279999999997, "compression/movement_sparsity/importance_threshold": -0.00020798328988882715, "compression/movement_sparsity/linear_layer_sparsity": 0.4363304068804577, "compression/movement_sparsity/model_sparsity": 0.3922503210792747, "compression_loss": 4.87684965133667, "distillation_loss": 0.9700570702552795, "epoch": 1.9, "learning_rate": 2.1929824561403507e-05, "loss": 5.3363, "step": 760, "task_loss": 0.39909374713897705 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018346265000000007, "compression/movement_sparsity/importance_threshold": -0.0002042018689732103, "compression/movement_sparsity/linear_layer_sparsity": 0.452758193597561, "compression/movement_sparsity/model_sparsity": 0.407018497930564, "compression_loss": 4.985548973083496, "distillation_loss": 0.6564913988113403, "epoch": 1.93, "learning_rate": 2.222222222222222e-05, "loss": 5.4895, "step": 770, "task_loss": 0.320431113243103 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01874236, "compression/movement_sparsity/importance_threshold": -0.00020046656237178833, "compression/movement_sparsity/linear_layer_sparsity": 0.46551143527928335, "compression/movement_sparsity/model_sparsity": 0.4184833490286625, "compression_loss": 5.092935085296631, "distillation_loss": 0.4097839295864105, "epoch": 1.95, "learning_rate": 2.2514619883040933e-05, "loss": 5.6222, "step": 780, "task_loss": 0.12948280572891235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019133595000000003, "compression/movement_sparsity/importance_threshold": -0.0001967770871746579, "compression/movement_sparsity/linear_layer_sparsity": 0.47700519657106294, "compression/movement_sparsity/model_sparsity": 0.4288159581845132, "compression_loss": 5.199016571044922, "distillation_loss": 0.44741755723953247, "epoch": 1.98, "learning_rate": 2.2807017543859645e-05, "loss": 5.5957, "step": 790, "task_loss": 0.12202805280685425 }, { "epoch": 2.0, "eval_accuracy": 0.9545454545454546, "eval_loss": 5.479804515838623, "eval_runtime": 86.4849, "eval_samples_per_second": 78.603, "eval_steps_per_second": 2.463, "step": 798 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019558376005000003, "compression/movement_sparsity/importance_threshold": -0.0001927712620768021, "compression/movement_sparsity/linear_layer_sparsity": 0.48800660098614873, "compression/movement_sparsity/model_sparsity": 0.4387059505987312, "compression_loss": 5.314239025115967, "distillation_loss": 0.7526724934577942, "epoch": 2.01, "learning_rate": 2.3099415204678358e-05, "loss": 5.8631, "step": 800, "task_loss": 0.40847355127334595 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019939502655, "compression/movement_sparsity/importance_threshold": -0.00018917711195694988, "compression/movement_sparsity/linear_layer_sparsity": 0.49847956187895215, "compression/movement_sparsity/model_sparsity": 0.44812088526309046, "compression_loss": 5.417582988739014, "distillation_loss": 1.0302109718322754, "epoch": 2.03, "learning_rate": 2.3391812865497074e-05, "loss": 5.8711, "step": 810, "task_loss": 0.4869253635406494 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020315862305000005, "compression/movement_sparsity/importance_threshold": -0.0001856279162206899, "compression/movement_sparsity/linear_layer_sparsity": 0.5098980799269798, "compression/movement_sparsity/model_sparsity": 0.4583858525905118, "compression_loss": 5.519580364227295, "distillation_loss": 1.0941754579544067, "epoch": 2.06, "learning_rate": 2.3684210526315787e-05, "loss": 5.9025, "step": 820, "task_loss": 0.6853399276733398 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020687484955000005, "compression/movement_sparsity/importance_threshold": -0.00018212339195811914, "compression/movement_sparsity/linear_layer_sparsity": 0.5205379474367661, "compression/movement_sparsity/model_sparsity": 0.4679508322049121, "compression_loss": 5.620279312133789, "distillation_loss": 0.39905670285224915, "epoch": 2.08, "learning_rate": 2.39766081871345e-05, "loss": 6.0413, "step": 830, "task_loss": 0.3336109519004822 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021054400605000002, "compression/movement_sparsity/importance_threshold": -0.0001786632562593346, "compression/movement_sparsity/linear_layer_sparsity": 0.5318343025444143, "compression/movement_sparsity/model_sparsity": 0.47810597804881505, "compression_loss": 5.719671726226807, "distillation_loss": 1.3409157991409302, "epoch": 2.11, "learning_rate": 2.4269005847953213e-05, "loss": 6.1711, "step": 840, "task_loss": 0.5142614841461182 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021416639255000005, "compression/movement_sparsity/importance_threshold": -0.0001752472262144332, "compression/movement_sparsity/linear_layer_sparsity": 0.5451425822417947, "compression/movement_sparsity/model_sparsity": 0.4900697946932517, "compression_loss": 5.817708492279053, "distillation_loss": 1.0011571645736694, "epoch": 2.13, "learning_rate": 2.4561403508771925e-05, "loss": 6.2293, "step": 850, "task_loss": 0.8053411841392517 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021774230905000004, "compression/movement_sparsity/importance_threshold": -0.00017187501891351197, "compression/movement_sparsity/linear_layer_sparsity": 0.5574857441282746, "compression/movement_sparsity/model_sparsity": 0.5011659941255129, "compression_loss": 5.914428234100342, "distillation_loss": 0.577518105506897, "epoch": 2.16, "learning_rate": 2.4853801169590638e-05, "loss": 6.3091, "step": 860, "task_loss": 0.6429996490478516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022127205555, "compression/movement_sparsity/importance_threshold": -0.00016854635144666782, "compression/movement_sparsity/linear_layer_sparsity": 0.5681574050549534, "compression/movement_sparsity/model_sparsity": 0.5107595552409675, "compression_loss": 6.009877681732178, "distillation_loss": 0.9484221935272217, "epoch": 2.18, "learning_rate": 2.5146198830409354e-05, "loss": 6.4402, "step": 870, "task_loss": 0.5138943195343018 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022475593205000005, "compression/movement_sparsity/importance_threshold": -0.00016526094090399767, "compression/movement_sparsity/linear_layer_sparsity": 0.5798891288392051, "compression/movement_sparsity/model_sparsity": 0.5213060868340473, "compression_loss": 6.104116439819336, "distillation_loss": 0.4834554195404053, "epoch": 2.21, "learning_rate": 2.5438596491228067e-05, "loss": 6.497, "step": 880, "task_loss": 0.3683658838272095 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022819423855000003, "compression/movement_sparsity/importance_threshold": -0.00016201850437559863, "compression/movement_sparsity/linear_layer_sparsity": 0.5897178466576333, "compression/movement_sparsity/model_sparsity": 0.5301418627948372, "compression_loss": 6.197108745574951, "distillation_loss": 0.4105452597141266, "epoch": 2.23, "learning_rate": 2.573099415204678e-05, "loss": 6.601, "step": 890, "task_loss": 0.09200382232666016 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023158727504999997, "compression/movement_sparsity/importance_threshold": -0.00015881875895156758, "compression/movement_sparsity/linear_layer_sparsity": 0.5997669541365552, "compression/movement_sparsity/model_sparsity": 0.5391757636484333, "compression_loss": 6.288847923278809, "distillation_loss": 0.6942120790481567, "epoch": 2.26, "learning_rate": 2.6023391812865492e-05, "loss": 6.6452, "step": 900, "task_loss": 0.3441739082336426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023493534155000006, "compression/movement_sparsity/importance_threshold": -0.00015566142172200137, "compression/movement_sparsity/linear_layer_sparsity": 0.6099394948810599, "compression/movement_sparsity/model_sparsity": 0.548320627976711, "compression_loss": 6.3793840408325195, "distillation_loss": 0.8501518964767456, "epoch": 2.28, "learning_rate": 2.6315789473684205e-05, "loss": 6.7787, "step": 910, "task_loss": 0.4232085049152374 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023823873805, "compression/movement_sparsity/importance_threshold": -0.0001525462097769972, "compression/movement_sparsity/linear_layer_sparsity": 0.6198110626505571, "compression/movement_sparsity/model_sparsity": 0.5571949249912712, "compression_loss": 6.468705177307129, "distillation_loss": 0.4476397633552551, "epoch": 2.31, "learning_rate": 2.660818713450292e-05, "loss": 6.8706, "step": 920, "task_loss": 0.26603904366493225 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024149776455000004, "compression/movement_sparsity/importance_threshold": -0.00014947284020665178, "compression/movement_sparsity/linear_layer_sparsity": 0.6292107046070461, "compression/movement_sparsity/model_sparsity": 0.5656449722887386, "compression_loss": 6.55678129196167, "distillation_loss": 0.8385058641433716, "epoch": 2.33, "learning_rate": 2.6900584795321634e-05, "loss": 7.0258, "step": 930, "task_loss": 0.5302727818489075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024471272104999997, "compression/movement_sparsity/importance_threshold": -0.00014644103010106235, "compression/movement_sparsity/linear_layer_sparsity": 0.6367144802958447, "compression/movement_sparsity/model_sparsity": 0.5723906823672124, "compression_loss": 6.6436238288879395, "distillation_loss": 0.4353128671646118, "epoch": 2.36, "learning_rate": 2.7192982456140347e-05, "loss": 7.0157, "step": 940, "task_loss": 0.17246079444885254 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024788390755, "compression/movement_sparsity/importance_threshold": -0.00014345049655032564, "compression/movement_sparsity/linear_layer_sparsity": 0.6451845264980428, "compression/movement_sparsity/model_sparsity": 0.5800050459091022, "compression_loss": 6.729299545288086, "distillation_loss": 0.4371870756149292, "epoch": 2.38, "learning_rate": 2.748538011695906e-05, "loss": 7.135, "step": 950, "task_loss": 0.2832848131656647 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025101162405, "compression/movement_sparsity/importance_threshold": -0.00014050095664453873, "compression/movement_sparsity/linear_layer_sparsity": 0.653826243036736, "compression/movement_sparsity/model_sparsity": 0.5877737368678947, "compression_loss": 6.813790321350098, "distillation_loss": 0.9059510231018066, "epoch": 2.41, "learning_rate": 2.7777777777777772e-05, "loss": 7.2781, "step": 960, "task_loss": 0.3703336715698242 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025409617055, "compression/movement_sparsity/importance_threshold": -0.00013759212747379855, "compression/movement_sparsity/linear_layer_sparsity": 0.6619689099668774, "compression/movement_sparsity/model_sparsity": 0.5950937944834637, "compression_loss": 6.897071838378906, "distillation_loss": 0.3913211226463318, "epoch": 2.43, "learning_rate": 2.807017543859649e-05, "loss": 7.3028, "step": 970, "task_loss": 0.11039167642593384 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025713784705, "compression/movement_sparsity/importance_threshold": -0.00013472372612820209, "compression/movement_sparsity/linear_layer_sparsity": 0.6690985866455886, "compression/movement_sparsity/model_sparsity": 0.6015031987383959, "compression_loss": 6.979084491729736, "distillation_loss": 0.7541512250900269, "epoch": 2.46, "learning_rate": 2.83625730994152e-05, "loss": 7.3291, "step": 980, "task_loss": 0.5565091967582703 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026013695355, "compression/movement_sparsity/importance_threshold": -0.0001318954696978463, "compression/movement_sparsity/linear_layer_sparsity": 0.6808277697794339, "compression/movement_sparsity/model_sparsity": 0.6120474463491494, "compression_loss": 7.059789657592773, "distillation_loss": 0.4585632085800171, "epoch": 2.48, "learning_rate": 2.8654970760233914e-05, "loss": 7.4721, "step": 990, "task_loss": 0.2957744300365448 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026309379005, "compression/movement_sparsity/importance_threshold": -0.00012910707527282811, "compression/movement_sparsity/linear_layer_sparsity": 0.6891670665838603, "compression/movement_sparsity/model_sparsity": 0.6195442693931776, "compression_loss": 7.139459609985352, "distillation_loss": 0.43910276889801025, "epoch": 2.51, "learning_rate": 2.8947368421052627e-05, "loss": 7.5405, "step": 1000, "task_loss": 0.24162358045578003 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026600865655, "compression/movement_sparsity/importance_threshold": -0.00012635825994324458, "compression/movement_sparsity/linear_layer_sparsity": 0.6972308322041554, "compression/movement_sparsity/model_sparsity": 0.6267933966687271, "compression_loss": 7.217947483062744, "distillation_loss": 0.6317572593688965, "epoch": 2.53, "learning_rate": 2.923976608187134e-05, "loss": 7.5692, "step": 1010, "task_loss": 0.2147519886493683 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026888185305, "compression/movement_sparsity/importance_threshold": -0.00012364874079919257, "compression/movement_sparsity/linear_layer_sparsity": 0.7072264684018368, "compression/movement_sparsity/model_sparsity": 0.6357792281535505, "compression_loss": 7.2952799797058105, "distillation_loss": 0.9247174263000488, "epoch": 2.56, "learning_rate": 2.9532163742690056e-05, "loss": 7.7218, "step": 1020, "task_loss": 0.4004303216934204 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027171367955, "compression/movement_sparsity/importance_threshold": -0.00012097823493076911, "compression/movement_sparsity/linear_layer_sparsity": 0.71374941188648, "compression/movement_sparsity/model_sparsity": 0.6416431941662016, "compression_loss": 7.371541976928711, "distillation_loss": 1.099261999130249, "epoch": 2.58, "learning_rate": 2.982456140350877e-05, "loss": 7.7844, "step": 1030, "task_loss": 0.45426321029663086 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027450443605000003, "compression/movement_sparsity/importance_threshold": -0.00011834645942807113, "compression/movement_sparsity/linear_layer_sparsity": 0.7193642610471244, "compression/movement_sparsity/model_sparsity": 0.6466908056811099, "compression_loss": 7.4467082023620605, "distillation_loss": 1.6781150102615356, "epoch": 2.61, "learning_rate": 3.011695906432748e-05, "loss": 7.8146, "step": 1040, "task_loss": 0.8923892974853516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027725442255, "compression/movement_sparsity/importance_threshold": -0.00011575313138119565, "compression/movement_sparsity/linear_layer_sparsity": 0.7244986567487203, "compression/movement_sparsity/model_sparsity": 0.6513065013345436, "compression_loss": 7.520779609680176, "distillation_loss": 0.12322430312633514, "epoch": 2.63, "learning_rate": 3.0409356725146194e-05, "loss": 7.8336, "step": 1050, "task_loss": 0.023247644305229187 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027996393905000003, "compression/movement_sparsity/importance_threshold": -0.00011319796788023954, "compression/movement_sparsity/linear_layer_sparsity": 0.7297716943315267, "compression/movement_sparsity/model_sparsity": 0.6560468326346391, "compression_loss": 7.5938239097595215, "distillation_loss": 1.5146890878677368, "epoch": 2.66, "learning_rate": 3.070175438596491e-05, "loss": 8.0602, "step": 1060, "task_loss": 0.8802045583724976 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028263328555, "compression/movement_sparsity/importance_threshold": -0.0001106806860152999, "compression/movement_sparsity/linear_layer_sparsity": 0.7346848182023487, "compression/movement_sparsity/model_sparsity": 0.6604636103458478, "compression_loss": 7.665744304656982, "distillation_loss": 0.43239685893058777, "epoch": 2.68, "learning_rate": 3.099415204678362e-05, "loss": 8.035, "step": 1070, "task_loss": 0.1623140275478363 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028526276205, "compression/movement_sparsity/importance_threshold": -0.00010820100287647357, "compression/movement_sparsity/linear_layer_sparsity": 0.7404099974593495, "compression/movement_sparsity/model_sparsity": 0.6656104059080701, "compression_loss": 7.736471176147461, "distillation_loss": 0.46192824840545654, "epoch": 2.71, "learning_rate": 3.128654970760233e-05, "loss": 8.0645, "step": 1080, "task_loss": 0.21585744619369507 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028785266855, "compression/movement_sparsity/importance_threshold": -0.0001057586355538575, "compression/movement_sparsity/linear_layer_sparsity": 0.7466543633318278, "compression/movement_sparsity/model_sparsity": 0.6712239374882496, "compression_loss": 7.806110858917236, "distillation_loss": 2.1456828117370605, "epoch": 2.73, "learning_rate": 3.1578947368421045e-05, "loss": 8.1415, "step": 1090, "task_loss": 1.1881322860717773 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029040330505000003, "compression/movement_sparsity/importance_threshold": -0.0001033533011375488, "compression/movement_sparsity/linear_layer_sparsity": 0.7539027095566094, "compression/movement_sparsity/model_sparsity": 0.6777400227510018, "compression_loss": 7.874512195587158, "distillation_loss": 0.3218111991882324, "epoch": 2.76, "learning_rate": 3.1871345029239764e-05, "loss": 8.2399, "step": 1100, "task_loss": 0.1626085638999939 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029291497154999995, "compression/movement_sparsity/importance_threshold": -0.00010098471671764442, "compression/movement_sparsity/linear_layer_sparsity": 0.7604099739348088, "compression/movement_sparsity/model_sparsity": 0.6835898936319823, "compression_loss": 7.941775798797607, "distillation_loss": 0.9839245676994324, "epoch": 2.78, "learning_rate": 3.216374269005848e-05, "loss": 8.3905, "step": 1110, "task_loss": 0.3942227065563202 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029538796805, "compression/movement_sparsity/importance_threshold": -9.86525993842411e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.766957206507829, "compression/movement_sparsity/model_sparsity": 0.6894756949386317, "compression_loss": 8.007960319519043, "distillation_loss": 0.7333301305770874, "epoch": 2.81, "learning_rate": 3.245614035087719e-05, "loss": 8.397, "step": 1120, "task_loss": 0.3495325744152069 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029782259455, "compression/movement_sparsity/importance_threshold": -9.635666622743598e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7723407153342366, "compression/movement_sparsity/model_sparsity": 0.6943153371739483, "compression_loss": 8.0732421875, "distillation_loss": 0.6021356582641602, "epoch": 2.83, "learning_rate": 3.27485380116959e-05, "loss": 8.4913, "step": 1130, "task_loss": 0.40355244278907776 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030021915105000005, "compression/movement_sparsity/importance_threshold": -9.409663433732605e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7760336447982535, "compression/movement_sparsity/model_sparsity": 0.6976351900770273, "compression_loss": 8.137558937072754, "distillation_loss": 0.49863767623901367, "epoch": 2.86, "learning_rate": 3.3040935672514615e-05, "loss": 8.5122, "step": 1140, "task_loss": 0.14769184589385986 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030257793755, "compression/movement_sparsity/importance_threshold": -9.187222080400827e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7808857460102379, "compression/movement_sparsity/model_sparsity": 0.7019971099164379, "compression_loss": 8.200807571411133, "distillation_loss": 0.9741989970207214, "epoch": 2.88, "learning_rate": 3.333333333333333e-05, "loss": 8.5404, "step": 1150, "task_loss": 0.5826123952865601 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030489925404999996, "compression/movement_sparsity/importance_threshold": -8.968314271757958e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7852331987729599, "compression/movement_sparsity/model_sparsity": 0.7059053631923137, "compression_loss": 8.263005256652832, "distillation_loss": 0.46970468759536743, "epoch": 2.91, "learning_rate": 3.362573099415204e-05, "loss": 8.6595, "step": 1160, "task_loss": 0.5217117071151733 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030718340054999994, "compression/movement_sparsity/importance_threshold": -8.752911716813688e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7899744405864197, "compression/movement_sparsity/model_sparsity": 0.710167623154759, "compression_loss": 8.32394790649414, "distillation_loss": 0.6907594203948975, "epoch": 2.93, "learning_rate": 3.3918128654970754e-05, "loss": 8.7009, "step": 1170, "task_loss": 0.17913195490837097 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030943067705000004, "compression/movement_sparsity/importance_threshold": -8.540986124577711e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7945701713527552, "compression/movement_sparsity/model_sparsity": 0.7142990722590679, "compression_loss": 8.384001731872559, "distillation_loss": 0.9414887428283691, "epoch": 2.96, "learning_rate": 3.4210526315789466e-05, "loss": 8.8272, "step": 1180, "task_loss": 0.6663200259208679 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031164138355, "compression/movement_sparsity/importance_threshold": -8.33250920405974e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.7987915561013249, "compression/movement_sparsity/model_sparsity": 0.7180939934859978, "compression_loss": 8.443134307861328, "distillation_loss": 0.538488507270813, "epoch": 2.98, "learning_rate": 3.450292397660818e-05, "loss": 8.7806, "step": 1190, "task_loss": 0.26449117064476013 }, { "epoch": 3.0, "eval_accuracy": 0.9633715798764343, "eval_loss": 8.649137496948242, "eval_runtime": 86.2151, "eval_samples_per_second": 78.849, "eval_steps_per_second": 2.471, "step": 1197 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03140312804, "compression/movement_sparsity/importance_threshold": -8.107134042027329e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8017074229336043, "compression/movement_sparsity/model_sparsity": 0.7207152861650097, "compression_loss": 8.507065773010254, "distillation_loss": 0.6942938566207886, "epoch": 3.01, "learning_rate": 3.47953216374269e-05, "loss": 9.0818, "step": 1200, "task_loss": 0.3004140853881836 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03161661664, "compression/movement_sparsity/importance_threshold": -7.905807244943714e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.804804922745408, "compression/movement_sparsity/model_sparsity": 0.7234998624323609, "compression_loss": 8.564077377319336, "distillation_loss": 0.49231940507888794, "epoch": 3.03, "learning_rate": 3.508771929824561e-05, "loss": 8.9, "step": 1210, "task_loss": 0.2517680525779724 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03182654124, "compression/movement_sparsity/importance_threshold": -7.70784141750815e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8087666200880759, "compression/movement_sparsity/model_sparsity": 0.7270613310583741, "compression_loss": 8.620096206665039, "distillation_loss": 0.5323042273521423, "epoch": 3.06, "learning_rate": 3.5380116959064324e-05, "loss": 8.9514, "step": 1220, "task_loss": 0.19435852766036987 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03203293184, "compression/movement_sparsity/importance_threshold": -7.51320826873034e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8120925784778681, "compression/movement_sparsity/model_sparsity": 0.7300512859772156, "compression_loss": 8.675074577331543, "distillation_loss": 0.7101088762283325, "epoch": 3.08, "learning_rate": 3.567251461988304e-05, "loss": 9.0237, "step": 1230, "task_loss": 0.5060630440711975 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03223581844, "compression/movement_sparsity/importance_threshold": -7.321879507619987e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8161153220039145, "compression/movement_sparsity/model_sparsity": 0.7336676336230114, "compression_loss": 8.729106903076172, "distillation_loss": 0.5983076095581055, "epoch": 3.11, "learning_rate": 3.5964912280701756e-05, "loss": 9.0547, "step": 1240, "task_loss": 0.27109333872795105 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03243523104, "compression/movement_sparsity/importance_threshold": -7.133826843186772e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8184793031090033, "compression/movement_sparsity/model_sparsity": 0.7357927945856084, "compression_loss": 8.78226375579834, "distillation_loss": 0.8876118659973145, "epoch": 3.13, "learning_rate": 3.625730994152046e-05, "loss": 9.1706, "step": 1250, "task_loss": 0.4600517749786377 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03263119964, "compression/movement_sparsity/importance_threshold": -6.949021984440403e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8217362052092743, "compression/movement_sparsity/model_sparsity": 0.7387206695959445, "compression_loss": 8.834550857543945, "distillation_loss": 0.35698211193084717, "epoch": 3.16, "learning_rate": 3.654970760233918e-05, "loss": 9.1858, "step": 1260, "task_loss": 0.1868249475955963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03282375424, "compression/movement_sparsity/importance_threshold": -6.767436640390569e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8237155835968082, "compression/movement_sparsity/model_sparsity": 0.7405000821599198, "compression_loss": 8.88587760925293, "distillation_loss": 1.1052513122558594, "epoch": 3.18, "learning_rate": 3.684210526315789e-05, "loss": 9.3054, "step": 1270, "task_loss": 0.6997582316398621 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03301292484, "compression/movement_sparsity/importance_threshold": -6.58904252004697e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8271621287827462, "compression/movement_sparsity/model_sparsity": 0.743598441647318, "compression_loss": 8.936222076416016, "distillation_loss": 0.47881120443344116, "epoch": 3.21, "learning_rate": 3.713450292397661e-05, "loss": 9.2533, "step": 1280, "task_loss": 0.2920542061328888 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03319874144, "compression/movement_sparsity/importance_threshold": -6.413811332419306e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8296238778794038, "compression/movement_sparsity/model_sparsity": 0.745811493633505, "compression_loss": 8.985671997070312, "distillation_loss": 1.6269311904907227, "epoch": 3.23, "learning_rate": 3.7426900584795313e-05, "loss": 9.2957, "step": 1290, "task_loss": 0.6221576929092407 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03338123404, "compression/movement_sparsity/importance_threshold": -6.241714786517268e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8331334335478772, "compression/movement_sparsity/model_sparsity": 0.7489664979973916, "compression_loss": 9.034184455871582, "distillation_loss": 1.1283040046691895, "epoch": 3.26, "learning_rate": 3.771929824561403e-05, "loss": 9.4048, "step": 1300, "task_loss": 0.692903459072113 }, { "compression/movement_sparsity/importance_regularization_factor": 0.033560432640000006, "compression/movement_sparsity/importance_threshold": -6.0727245913505545e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8352013465447155, "compression/movement_sparsity/model_sparsity": 0.7508255010010398, "compression_loss": 9.081707000732422, "distillation_loss": 0.40741151571273804, "epoch": 3.28, "learning_rate": 3.8011695906432746e-05, "loss": 9.4243, "step": 1310, "task_loss": 0.13411134481430054 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03373636724, "compression/movement_sparsity/importance_threshold": -5.906812455928862e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8383988256549232, "compression/movement_sparsity/model_sparsity": 0.7536999562025241, "compression_loss": 9.128183364868164, "distillation_loss": 0.6723507046699524, "epoch": 3.31, "learning_rate": 3.830409356725146e-05, "loss": 9.5088, "step": 1320, "task_loss": 0.2996169328689575 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03390906784, "compression/movement_sparsity/importance_threshold": -5.7439500892618944e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8412121019647697, "compression/movement_sparsity/model_sparsity": 0.7562290225211231, "compression_loss": 9.1739501953125, "distillation_loss": 0.4882902503013611, "epoch": 3.33, "learning_rate": 3.859649122807017e-05, "loss": 9.4872, "step": 1330, "task_loss": 0.26708441972732544 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03407856444, "compression/movement_sparsity/importance_threshold": -5.5841092003593325e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8429411557136405, "compression/movement_sparsity/model_sparsity": 0.7577833993820136, "compression_loss": 9.218880653381348, "distillation_loss": 0.8691713809967041, "epoch": 3.36, "learning_rate": 3.8888888888888884e-05, "loss": 9.5652, "step": 1340, "task_loss": 0.5940840840339661 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03424488704, "compression/movement_sparsity/importance_threshold": -5.427261498230885e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8445117246311352, "compression/movement_sparsity/model_sparsity": 0.7591953022713147, "compression_loss": 9.263068199157715, "distillation_loss": 0.4419710636138916, "epoch": 3.38, "learning_rate": 3.91812865497076e-05, "loss": 9.546, "step": 1350, "task_loss": 0.26113927364349365 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034408065640000006, "compression/movement_sparsity/importance_threshold": -5.2733786918862386e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.846551690473502, "compression/movement_sparsity/model_sparsity": 0.7610291814693747, "compression_loss": 9.306303977966309, "distillation_loss": 1.0447174310684204, "epoch": 3.41, "learning_rate": 3.9473684210526316e-05, "loss": 9.6604, "step": 1360, "task_loss": 0.6508020162582397 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03456813024, "compression/movement_sparsity/importance_threshold": -5.122432490335102e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8479912041741945, "compression/movement_sparsity/model_sparsity": 0.7623232689370156, "compression_loss": 9.348631858825684, "distillation_loss": 0.6512007713317871, "epoch": 3.43, "learning_rate": 3.976608187134502e-05, "loss": 9.6694, "step": 1370, "task_loss": 0.38973358273506165 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03472511084, "compression/movement_sparsity/importance_threshold": -4.974394602587167e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8504058218533574, "compression/movement_sparsity/model_sparsity": 0.7644939509362529, "compression_loss": 9.389874458312988, "distillation_loss": 1.7942403554916382, "epoch": 3.46, "learning_rate": 4.005847953216374e-05, "loss": 9.7758, "step": 1380, "task_loss": 0.7294902801513672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03487903744, "compression/movement_sparsity/importance_threshold": -4.8292367376521204e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8521922637195122, "compression/movement_sparsity/model_sparsity": 0.7660999183053359, "compression_loss": 9.430490493774414, "distillation_loss": 0.4566546678543091, "epoch": 3.48, "learning_rate": 4.035087719298245e-05, "loss": 9.8213, "step": 1390, "task_loss": 0.14384031295776367 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03502994004, "compression/movement_sparsity/importance_threshold": -4.686930604539676e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8542740561954231, "compression/movement_sparsity/model_sparsity": 0.7679713986198402, "compression_loss": 9.47036361694336, "distillation_loss": 0.6844474673271179, "epoch": 3.51, "learning_rate": 4.064327485380117e-05, "loss": 9.8039, "step": 1400, "task_loss": 0.5877740979194641 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03517784864, "compression/movement_sparsity/importance_threshold": -4.547447912259516e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8564435834462512, "compression/movement_sparsity/model_sparsity": 0.7699217503425445, "compression_loss": 9.50930118560791, "distillation_loss": 1.0622820854187012, "epoch": 3.53, "learning_rate": 4.093567251461988e-05, "loss": 9.821, "step": 1410, "task_loss": 0.4445667266845703 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035322793240000004, "compression/movement_sparsity/importance_threshold": -4.4107603698213425e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8580350892050587, "compression/movement_sparsity/model_sparsity": 0.7713524749380523, "compression_loss": 9.547521591186523, "distillation_loss": 0.6739060282707214, "epoch": 3.56, "learning_rate": 4.122807017543859e-05, "loss": 9.8988, "step": 1420, "task_loss": 0.3877226710319519 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03546480384, "compression/movement_sparsity/importance_threshold": -4.2768396862348534e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8592993215522433, "compression/movement_sparsity/model_sparsity": 0.7724889887731695, "compression_loss": 9.585060119628906, "distillation_loss": 0.45944637060165405, "epoch": 3.58, "learning_rate": 4.1520467836257306e-05, "loss": 9.9251, "step": 1430, "task_loss": 0.2797248959541321 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035603910440000004, "compression/movement_sparsity/importance_threshold": -4.145657570509741e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.860241573509485, "compression/movement_sparsity/model_sparsity": 0.773336050144409, "compression_loss": 9.62182903289795, "distillation_loss": 0.20794469118118286, "epoch": 3.61, "learning_rate": 4.181286549707602e-05, "loss": 9.9025, "step": 1440, "task_loss": 0.049867182970047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035740143040000005, "compression/movement_sparsity/importance_threshold": -4.017185731655708e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.861661949996236, "compression/movement_sparsity/model_sparsity": 0.7746129337266576, "compression_loss": 9.657830238342285, "distillation_loss": 1.0586156845092773, "epoch": 3.63, "learning_rate": 4.210526315789473e-05, "loss": 9.9896, "step": 1450, "task_loss": 0.6514222025871277 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035873531640000005, "compression/movement_sparsity/importance_threshold": -3.891395878682447e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8629507537262873, "compression/movement_sparsity/model_sparsity": 0.775771536631587, "compression_loss": 9.692946434020996, "distillation_loss": 0.4212111234664917, "epoch": 3.66, "learning_rate": 4.239766081871345e-05, "loss": 10.0329, "step": 1460, "task_loss": 0.21880501508712769 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03600410624, "compression/movement_sparsity/importance_threshold": -3.76825972059965e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.864051102359982, "compression/movement_sparsity/model_sparsity": 0.7767607230326717, "compression_loss": 9.7272367477417, "distillation_loss": 0.5302681922912598, "epoch": 3.68, "learning_rate": 4.2690058479532157e-05, "loss": 10.0569, "step": 1470, "task_loss": 0.4268465042114258 }, { "compression/movement_sparsity/importance_regularization_factor": 0.036131896840000004, "compression/movement_sparsity/importance_threshold": -3.647748966417025e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8658141373080398, "compression/movement_sparsity/model_sparsity": 0.7783456481571752, "compression_loss": 9.760616302490234, "distillation_loss": 0.399345338344574, "epoch": 3.71, "learning_rate": 4.2982456140350876e-05, "loss": 10.0997, "step": 1480, "task_loss": 0.30424150824546814 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03625693344, "compression/movement_sparsity/importance_threshold": -3.529835325144254e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8674597847975007, "compression/movement_sparsity/model_sparsity": 0.7798250448390142, "compression_loss": 9.793109893798828, "distillation_loss": 0.43434837460517883, "epoch": 3.73, "learning_rate": 4.327485380116958e-05, "loss": 10.1041, "step": 1490, "task_loss": 0.2279742956161499 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03637924604, "compression/movement_sparsity/importance_threshold": -3.4144905057910456e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8694156856933153, "compression/movement_sparsity/model_sparsity": 0.7815833517144565, "compression_loss": 9.824585914611816, "distillation_loss": 0.8438689708709717, "epoch": 3.76, "learning_rate": 4.35672514619883e-05, "loss": 10.1255, "step": 1500, "task_loss": 0.44409793615341187 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03649886464, "compression/movement_sparsity/importance_threshold": -3.3016862173670914e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8707779212774767, "compression/movement_sparsity/model_sparsity": 0.7828079680530087, "compression_loss": 9.855549812316895, "distillation_loss": 0.7053812742233276, "epoch": 3.78, "learning_rate": 4.3859649122807014e-05, "loss": 10.2668, "step": 1510, "task_loss": 0.37573811411857605 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03661581924, "compression/movement_sparsity/importance_threshold": -3.191394168882084e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8729380622365251, "compression/movement_sparsity/model_sparsity": 0.7847498817298967, "compression_loss": 9.886014938354492, "distillation_loss": 0.4807725250720978, "epoch": 3.81, "learning_rate": 4.415204678362573e-05, "loss": 10.2452, "step": 1520, "task_loss": 0.16565313935279846 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03673013984, "compression/movement_sparsity/importance_threshold": -3.083586069345726e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8737130194218609, "compression/movement_sparsity/model_sparsity": 0.7854465492093509, "compression_loss": 9.915853500366211, "distillation_loss": 0.5819356441497803, "epoch": 3.83, "learning_rate": 4.444444444444444e-05, "loss": 10.2462, "step": 1530, "task_loss": 0.3054378628730774 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03684185644, "compression/movement_sparsity/importance_threshold": -2.9782336277677207e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8748435029923216, "compression/movement_sparsity/model_sparsity": 0.7864628261785824, "compression_loss": 9.944876670837402, "distillation_loss": 0.7969239354133606, "epoch": 3.86, "learning_rate": 4.473684210526315e-05, "loss": 10.3467, "step": 1540, "task_loss": 0.3591158986091614 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03695099904, "compression/movement_sparsity/importance_threshold": -2.875308553157744e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8757265789671785, "compression/movement_sparsity/model_sparsity": 0.7872566897948075, "compression_loss": 9.973067283630371, "distillation_loss": 0.6269478797912598, "epoch": 3.88, "learning_rate": 4.5029239766081865e-05, "loss": 10.2859, "step": 1550, "task_loss": 0.30371320247650146 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03705759764, "compression/movement_sparsity/importance_threshold": -2.77478255452551e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8767618469587474, "compression/movement_sparsity/model_sparsity": 0.7881873702967676, "compression_loss": 10.00046443939209, "distillation_loss": 0.4931795001029968, "epoch": 3.91, "learning_rate": 4.5321637426900585e-05, "loss": 10.3543, "step": 1560, "task_loss": 0.308719664812088 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03716168224, "compression/movement_sparsity/importance_threshold": -2.6766273408807104e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8784133638211382, "compression/movement_sparsity/model_sparsity": 0.7896720434007397, "compression_loss": 10.027097702026367, "distillation_loss": 0.1957966387271881, "epoch": 3.93, "learning_rate": 4.561403508771929e-05, "loss": 10.292, "step": 1570, "task_loss": 0.058112651109695435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03726328284, "compression/movement_sparsity/importance_threshold": -2.5808146212330377e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8799685947380307, "compression/movement_sparsity/model_sparsity": 0.7910701578041456, "compression_loss": 10.053182601928711, "distillation_loss": 0.5972878932952881, "epoch": 3.96, "learning_rate": 4.590643274853801e-05, "loss": 10.4266, "step": 1580, "task_loss": 0.23346292972564697 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03736242944, "compression/movement_sparsity/importance_threshold": -2.4873161045922004e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8811241084199036, "compression/movement_sparsity/model_sparsity": 0.7921089362288872, "compression_loss": 10.078638076782227, "distillation_loss": 0.35232701897621155, "epoch": 3.98, "learning_rate": 4.6198830409356716e-05, "loss": 10.4524, "step": 1590, "task_loss": 0.06011241674423218 }, { "epoch": 4.0, "eval_accuracy": 0.9554280670785525, "eval_loss": 10.270051956176758, "eval_runtime": 86.3945, "eval_samples_per_second": 78.686, "eval_steps_per_second": 2.465, "step": 1596 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037468692134999995, "compression/movement_sparsity/importance_threshold": -2.3871068754632266e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8816427657332129, "compression/movement_sparsity/model_sparsity": 0.7925751964171931, "compression_loss": 10.105827331542969, "distillation_loss": 0.40998631715774536, "epoch": 4.01, "learning_rate": 4.6491228070175436e-05, "loss": 10.7045, "step": 1600, "task_loss": 0.21127372980117798 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037562782985, "compression/movement_sparsity/importance_threshold": -2.298376097963279e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8831190835779885, "compression/movement_sparsity/model_sparsity": 0.7939023699065872, "compression_loss": 10.129731178283691, "distillation_loss": 0.547886312007904, "epoch": 4.04, "learning_rate": 4.678362573099415e-05, "loss": 10.4625, "step": 1610, "task_loss": 0.2604964077472687 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037654512835, "compression/movement_sparsity/importance_threshold": -2.2118718214002167e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8844327409853959, "compression/movement_sparsity/model_sparsity": 0.795083315657143, "compression_loss": 10.153118133544922, "distillation_loss": 0.3340034782886505, "epoch": 4.06, "learning_rate": 4.707602339181286e-05, "loss": 10.4671, "step": 1620, "task_loss": 0.111962229013443 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037743911685000006, "compression/movement_sparsity/importance_threshold": -2.1275657547837376e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8854097386894008, "compression/movement_sparsity/model_sparsity": 0.7959616126014918, "compression_loss": 10.175905227661133, "distillation_loss": 0.7999333143234253, "epoch": 4.09, "learning_rate": 4.7368421052631574e-05, "loss": 10.5007, "step": 1630, "task_loss": 0.41760003566741943 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037831009535, "compression/movement_sparsity/importance_threshold": -2.045429607123539e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8865368700127974, "compression/movement_sparsity/model_sparsity": 0.7969748759829319, "compression_loss": 10.197953224182129, "distillation_loss": 0.8680118322372437, "epoch": 4.11, "learning_rate": 4.766081871345029e-05, "loss": 10.548, "step": 1640, "task_loss": 0.4344426393508911 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037915836385, "compression/movement_sparsity/importance_threshold": -1.9654350874293187e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8878363656466426, "compression/movement_sparsity/model_sparsity": 0.7981430906468174, "compression_loss": 10.219234466552734, "distillation_loss": 0.37934863567352295, "epoch": 4.14, "learning_rate": 4.7953216374269e-05, "loss": 10.6091, "step": 1650, "task_loss": 0.16707447171211243 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037998422235, "compression/movement_sparsity/importance_threshold": -1.8875539047107635e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8892580006963264, "compression/movement_sparsity/model_sparsity": 0.7994211056462369, "compression_loss": 10.240059852600098, "distillation_loss": 0.2461932748556137, "epoch": 4.16, "learning_rate": 4.824561403508772e-05, "loss": 10.5569, "step": 1660, "task_loss": 0.21289575099945068 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038078797085000005, "compression/movement_sparsity/importance_threshold": -1.8117577679775763e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8904640568541102, "compression/movement_sparsity/model_sparsity": 0.8005053205156809, "compression_loss": 10.260123252868652, "distillation_loss": 0.2822727560997009, "epoch": 4.19, "learning_rate": 4.8538011695906425e-05, "loss": 10.5789, "step": 1670, "task_loss": 0.10792559385299683 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038156990935, "compression/movement_sparsity/importance_threshold": -1.73801838623946e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8910806821175851, "compression/movement_sparsity/model_sparsity": 0.801059651485445, "compression_loss": 10.279339790344238, "distillation_loss": 0.46864134073257446, "epoch": 4.21, "learning_rate": 4.8830409356725145e-05, "loss": 10.6432, "step": 1680, "task_loss": 0.26517313718795776 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038233033785, "compression/movement_sparsity/importance_threshold": -1.666307468506102e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8926093067788317, "compression/movement_sparsity/model_sparsity": 0.8024338475183792, "compression_loss": 10.29808521270752, "distillation_loss": 0.48441100120544434, "epoch": 4.24, "learning_rate": 4.912280701754385e-05, "loss": 10.5869, "step": 1690, "task_loss": 0.33757680654525757 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038306955635, "compression/movement_sparsity/importance_threshold": -1.596596723787199e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8934476390770852, "compression/movement_sparsity/model_sparsity": 0.8031874876680816, "compression_loss": 10.316293716430664, "distillation_loss": 0.4501994550228119, "epoch": 4.26, "learning_rate": 4.941520467836257e-05, "loss": 10.6399, "step": 1700, "task_loss": 0.1676151156425476 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038378786485, "compression/movement_sparsity/importance_threshold": -1.5288578610924547e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8948277650745258, "compression/movement_sparsity/model_sparsity": 0.8044281870488474, "compression_loss": 10.333884239196777, "distillation_loss": 0.3706062138080597, "epoch": 4.29, "learning_rate": 4.9707602339181276e-05, "loss": 10.66, "step": 1710, "task_loss": 0.272488534450531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038448556335, "compression/movement_sparsity/importance_threshold": -1.4630625894315556e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8957719930931948, "compression/movement_sparsity/model_sparsity": 0.8052770248507851, "compression_loss": 10.350982666015625, "distillation_loss": 0.6956163644790649, "epoch": 4.31, "learning_rate": 4.9999999999999996e-05, "loss": 10.6792, "step": 1720, "task_loss": 0.3349671959877014 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038516295185, "compression/movement_sparsity/importance_threshold": -1.3991826178142103e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8963361351061427, "compression/movement_sparsity/model_sparsity": 0.8057841746671255, "compression_loss": 10.367738723754883, "distillation_loss": 1.0592408180236816, "epoch": 4.34, "learning_rate": 5.029239766081871e-05, "loss": 10.6682, "step": 1730, "task_loss": 0.615516722202301 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038582033035, "compression/movement_sparsity/importance_threshold": -1.3371896552501054e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.896996621875941, "compression/movement_sparsity/model_sparsity": 0.8063779360539933, "compression_loss": 10.383992195129395, "distillation_loss": 0.4240041673183441, "epoch": 4.36, "learning_rate": 5.058479532163742e-05, "loss": 10.802, "step": 1740, "task_loss": 0.16404575109481812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038645799885, "compression/movement_sparsity/importance_threshold": -1.2770554107489386e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8979182428109004, "compression/movement_sparsity/model_sparsity": 0.8072064506428247, "compression_loss": 10.399662017822266, "distillation_loss": 0.40452438592910767, "epoch": 4.39, "learning_rate": 5.0877192982456134e-05, "loss": 10.7784, "step": 1750, "task_loss": 0.19599807262420654 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038707625735, "compression/movement_sparsity/importance_threshold": -1.2187515933204075e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8988828077950919, "compression/movement_sparsity/model_sparsity": 0.8080735708773643, "compression_loss": 10.414520263671875, "distillation_loss": 0.8717334866523743, "epoch": 4.41, "learning_rate": 5.1169590643274853e-05, "loss": 10.7116, "step": 1760, "task_loss": 0.3821563720703125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038767540585, "compression/movement_sparsity/importance_threshold": -1.162249911974215e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9006762482121349, "compression/movement_sparsity/model_sparsity": 0.8096858297718366, "compression_loss": 10.428637504577637, "distillation_loss": 0.6464477777481079, "epoch": 4.44, "learning_rate": 5.146198830409356e-05, "loss": 10.7727, "step": 1770, "task_loss": 0.4439537823200226 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038825574435, "compression/movement_sparsity/importance_threshold": -1.1075220757200482e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9014851630721168, "compression/movement_sparsity/model_sparsity": 0.8104130243669192, "compression_loss": 10.442300796508789, "distillation_loss": 0.9200720191001892, "epoch": 4.46, "learning_rate": 5.175438596491228e-05, "loss": 10.7621, "step": 1780, "task_loss": 0.31766802072525024 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038881757285, "compression/movement_sparsity/importance_threshold": -1.0545397935676098e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9020580444519724, "compression/movement_sparsity/model_sparsity": 0.810928030659502, "compression_loss": 10.45583724975586, "distillation_loss": 0.8315671682357788, "epoch": 4.49, "learning_rate": 5.2046783625730985e-05, "loss": 10.8142, "step": 1790, "task_loss": 0.42072778940200806 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038936119135, "compression/movement_sparsity/importance_threshold": -1.0032747745265921e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9024175582467631, "compression/movement_sparsity/model_sparsity": 0.8112512247326524, "compression_loss": 10.469006538391113, "distillation_loss": 0.5176489353179932, "epoch": 4.51, "learning_rate": 5.2339181286549704e-05, "loss": 10.9226, "step": 1800, "task_loss": 0.2870589792728424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038988689985, "compression/movement_sparsity/importance_threshold": -9.536987276066982e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9029426612842517, "compression/movement_sparsity/model_sparsity": 0.8117232794687119, "compression_loss": 10.481606483459473, "distillation_loss": 0.5399871468544006, "epoch": 4.54, "learning_rate": 5.263157894736841e-05, "loss": 10.8043, "step": 1810, "task_loss": 0.26069051027297974 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039039499835, "compression/movement_sparsity/importance_threshold": -9.057833618176148e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.903573718853508, "compression/movement_sparsity/model_sparsity": 0.8122905847269679, "compression_loss": 10.493739128112793, "distillation_loss": 0.3877241611480713, "epoch": 4.56, "learning_rate": 5.292397660818713e-05, "loss": 10.807, "step": 1820, "task_loss": 0.24742752313613892 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039088578685, "compression/movement_sparsity/importance_threshold": -8.595003861690395e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.904272821157031, "compression/movement_sparsity/model_sparsity": 0.8129190605303956, "compression_loss": 10.505409240722656, "distillation_loss": 0.25960269570350647, "epoch": 4.59, "learning_rate": 5.321637426900584e-05, "loss": 10.8415, "step": 1830, "task_loss": 0.06999611854553223 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039135956535, "compression/movement_sparsity/importance_threshold": -8.148215096706807e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9051532388587775, "compression/movement_sparsity/model_sparsity": 0.8137105344243719, "compression_loss": 10.516351699829102, "distillation_loss": 0.3734590411186218, "epoch": 4.61, "learning_rate": 5.3508771929824555e-05, "loss": 10.8276, "step": 1840, "task_loss": 0.2790951132774353 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039181663385, "compression/movement_sparsity/importance_threshold": -7.717184413322253e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9060454423554652, "compression/movement_sparsity/model_sparsity": 0.8145126034585837, "compression_loss": 10.526989936828613, "distillation_loss": 0.40441250801086426, "epoch": 4.64, "learning_rate": 5.380116959064327e-05, "loss": 10.8386, "step": 1850, "task_loss": 0.16886454820632935 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039225729235, "compression/movement_sparsity/importance_threshold": -7.301628901633709e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9065582185335742, "compression/movement_sparsity/model_sparsity": 0.8149735766507641, "compression_loss": 10.537022590637207, "distillation_loss": 0.5824317336082458, "epoch": 4.66, "learning_rate": 5.409356725146199e-05, "loss": 10.859, "step": 1860, "task_loss": 0.5143399834632874 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039268184085, "compression/movement_sparsity/importance_threshold": -6.90126565173815e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9073361398110509, "compression/movement_sparsity/model_sparsity": 0.8156729087762655, "compression_loss": 10.546780586242676, "distillation_loss": 0.5486133098602295, "epoch": 4.69, "learning_rate": 5.4385964912280694e-05, "loss": 10.8576, "step": 1870, "task_loss": 0.30259478092193604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039309057935, "compression/movement_sparsity/importance_threshold": -6.515811753732498e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9080930419113219, "compression/movement_sparsity/model_sparsity": 0.8163533451776145, "compression_loss": 10.556109428405762, "distillation_loss": 1.0528051853179932, "epoch": 4.71, "learning_rate": 5.467836257309941e-05, "loss": 10.8712, "step": 1880, "task_loss": 0.43705570697784424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039348380785, "compression/movement_sparsity/importance_threshold": -6.1449842977138384e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9087980606368564, "compression/movement_sparsity/model_sparsity": 0.8169871396991445, "compression_loss": 10.565235137939453, "distillation_loss": 0.3459569215774536, "epoch": 4.74, "learning_rate": 5.497076023391812e-05, "loss": 10.8912, "step": 1890, "task_loss": 0.12038382887840271 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039386182635, "compression/movement_sparsity/importance_threshold": -5.788500373778984e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.909603117472147, "compression/movement_sparsity/model_sparsity": 0.817710866024769, "compression_loss": 10.573878288269043, "distillation_loss": 0.5916445851325989, "epoch": 4.76, "learning_rate": 5.526315789473684e-05, "loss": 10.9138, "step": 1900, "task_loss": 0.37912750244140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039422493485, "compression/movement_sparsity/importance_threshold": -5.44607707202502e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9098056637684432, "compression/movement_sparsity/model_sparsity": 0.8178929501713303, "compression_loss": 10.581986427307129, "distillation_loss": 0.70872563123703, "epoch": 4.79, "learning_rate": 5.5555555555555545e-05, "loss": 10.916, "step": 1910, "task_loss": 0.49253469705581665 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039457343335, "compression/movement_sparsity/importance_threshold": -5.117431482548813e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9102467136216501, "compression/movement_sparsity/model_sparsity": 0.8182894431587645, "compression_loss": 10.589385986328125, "distillation_loss": 0.6817111372947693, "epoch": 4.81, "learning_rate": 5.5847953216374264e-05, "loss": 10.9246, "step": 1920, "task_loss": 0.3340785503387451 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039490762185, "compression/movement_sparsity/importance_threshold": -4.802280695447394e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9103951299495634, "compression/movement_sparsity/model_sparsity": 0.8184228657929871, "compression_loss": 10.596639633178711, "distillation_loss": 0.5131574869155884, "epoch": 4.84, "learning_rate": 5.614035087719298e-05, "loss": 10.8974, "step": 1930, "task_loss": 0.22064465284347534 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039522780035, "compression/movement_sparsity/importance_threshold": -4.500341800817739e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9110764006511592, "compression/movement_sparsity/model_sparsity": 0.8190353114241624, "compression_loss": 10.603409767150879, "distillation_loss": 0.28842249512672424, "epoch": 4.86, "learning_rate": 5.643274853801169e-05, "loss": 10.9209, "step": 1940, "task_loss": 0.08667704463005066 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039553426885, "compression/movement_sparsity/importance_threshold": -4.21133188875677e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9118364903267088, "compression/movement_sparsity/model_sparsity": 0.8197186133774114, "compression_loss": 10.609901428222656, "distillation_loss": 0.42135104537010193, "epoch": 4.89, "learning_rate": 5.67251461988304e-05, "loss": 10.9254, "step": 1950, "task_loss": 0.21301895380020142 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039582732735, "compression/movement_sparsity/importance_threshold": -3.934968049361463e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9121054934507679, "compression/movement_sparsity/model_sparsity": 0.8199604405801907, "compression_loss": 10.616052627563477, "distillation_loss": 0.816530704498291, "epoch": 4.91, "learning_rate": 5.7017543859649115e-05, "loss": 10.9946, "step": 1960, "task_loss": 0.5122554302215576 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039610727585, "compression/movement_sparsity/importance_threshold": -3.670967372728794e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9126333018104487, "compression/movement_sparsity/model_sparsity": 0.8204349273344679, "compression_loss": 10.621758460998535, "distillation_loss": 0.8322756290435791, "epoch": 4.94, "learning_rate": 5.730994152046783e-05, "loss": 10.9709, "step": 1970, "task_loss": 0.23779863119125366 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039637441435000004, "compression/movement_sparsity/importance_threshold": -3.4190469489556847e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9131550996499548, "compression/movement_sparsity/model_sparsity": 0.8209040107787048, "compression_loss": 10.627117156982422, "distillation_loss": 1.0545827150344849, "epoch": 4.96, "learning_rate": 5.760233918128655e-05, "loss": 10.953, "step": 1980, "task_loss": 0.510939359664917 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039662904285, "compression/movement_sparsity/importance_threshold": -3.17892386813922e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9135569458559168, "compression/movement_sparsity/model_sparsity": 0.8212652606499664, "compression_loss": 10.632038116455078, "distillation_loss": 0.3127363920211792, "epoch": 4.99, "learning_rate": 5.7894736842105253e-05, "loss": 10.8964, "step": 1990, "task_loss": 0.22342097759246826 }, { "epoch": 5.0, "eval_accuracy": 0.9646954986760812, "eval_loss": 10.7809419631958, "eval_runtime": 86.6556, "eval_samples_per_second": 78.448, "eval_steps_per_second": 2.458, "step": 1995 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03968950432, "compression/movement_sparsity/importance_threshold": -2.928076757386603e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9139671550361337, "compression/movement_sparsity/model_sparsity": 0.8216340286297185, "compression_loss": 10.637024879455566, "distillation_loss": 0.5320035815238953, "epoch": 5.01, "learning_rate": 5.818713450292397e-05, "loss": 11.2495, "step": 2000, "task_loss": 0.23087036609649658 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03971243772, "compression/movement_sparsity/importance_threshold": -2.7118072250444255e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9145193230578139, "compression/movement_sparsity/model_sparsity": 0.8221304141219473, "compression_loss": 10.641243934631348, "distillation_loss": 0.35051363706588745, "epoch": 5.04, "learning_rate": 5.847953216374268e-05, "loss": 10.9442, "step": 2010, "task_loss": 0.327785462141037 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03973421312, "compression/movement_sparsity/importance_threshold": -2.5064580149594478e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9151376420882265, "compression/movement_sparsity/model_sparsity": 0.8226862677465956, "compression_loss": 10.645480155944824, "distillation_loss": 0.5606256723403931, "epoch": 5.06, "learning_rate": 5.87719298245614e-05, "loss": 10.9266, "step": 2020, "task_loss": 0.20908093452453613 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03975486052, "compression/movement_sparsity/importance_threshold": -2.311746217228646e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9155960177657332, "compression/movement_sparsity/model_sparsity": 0.8230983362246153, "compression_loss": 10.649575233459473, "distillation_loss": 0.3791353106498718, "epoch": 5.09, "learning_rate": 5.906432748538011e-05, "loss": 10.9635, "step": 2030, "task_loss": 0.1277802586555481 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03977440992, "compression/movement_sparsity/importance_threshold": -2.1273889219488337e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9161118286096055, "compression/movement_sparsity/model_sparsity": 0.8235620375067965, "compression_loss": 10.653324127197266, "distillation_loss": 0.5789883136749268, "epoch": 5.11, "learning_rate": 5.9356725146198824e-05, "loss": 10.9533, "step": 2040, "task_loss": 0.33982402086257935 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03979289132, "compression/movement_sparsity/importance_threshold": -1.9531032192171495e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9163111049947305, "compression/movement_sparsity/model_sparsity": 0.8237411820835121, "compression_loss": 10.656829833984375, "distillation_loss": 0.4787919521331787, "epoch": 5.14, "learning_rate": 5.964912280701754e-05, "loss": 10.9379, "step": 2050, "task_loss": 0.1822199821472168 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03981033472, "compression/movement_sparsity/importance_threshold": -1.788606199130407e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9167545896379102, "compression/movement_sparsity/model_sparsity": 0.8241398638873422, "compression_loss": 10.659506797790527, "distillation_loss": 0.22967106103897095, "epoch": 5.16, "learning_rate": 5.994152046783625e-05, "loss": 10.9419, "step": 2060, "task_loss": 0.10331019759178162 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03982677012, "compression/movement_sparsity/importance_threshold": -1.6336149517856904e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9174214515582656, "compression/movement_sparsity/model_sparsity": 0.8247393563780101, "compression_loss": 10.66145133972168, "distillation_loss": 0.4336327314376831, "epoch": 5.19, "learning_rate": 6.023391812865496e-05, "loss": 10.9723, "step": 2070, "task_loss": 0.14386022090911865 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03984222752, "compression/movement_sparsity/importance_threshold": -1.4878465672798677e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9181478658536585, "compression/movement_sparsity/model_sparsity": 0.8253923849914446, "compression_loss": 10.662842750549316, "distillation_loss": 0.4906628429889679, "epoch": 5.21, "learning_rate": 6.052631578947368e-05, "loss": 10.9974, "step": 2080, "task_loss": 0.27048027515411377 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039856736920000004, "compression/movement_sparsity/importance_threshold": -1.3510181357099691e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.919091517521078, "compression/movement_sparsity/model_sparsity": 0.826240704667762, "compression_loss": 10.664116859436035, "distillation_loss": 0.4878236651420593, "epoch": 5.24, "learning_rate": 6.081871345029239e-05, "loss": 10.9883, "step": 2090, "task_loss": 0.28648656606674194 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03987032832, "compression/movement_sparsity/importance_threshold": -1.2228467471729707e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9201948537714544, "compression/movement_sparsity/model_sparsity": 0.8272325768628783, "compression_loss": 10.664933204650879, "distillation_loss": 0.8386104106903076, "epoch": 5.26, "learning_rate": 6.111111111111111e-05, "loss": 10.9988, "step": 2100, "task_loss": 0.48160696029663086 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03988303172, "compression/movement_sparsity/importance_threshold": -1.1030494917657944e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9208408611863896, "compression/movement_sparsity/model_sparsity": 0.8278133216652853, "compression_loss": 10.665301322937012, "distillation_loss": 0.33076217770576477, "epoch": 5.29, "learning_rate": 6.140350877192981e-05, "loss": 11.017, "step": 2110, "task_loss": 0.15396958589553833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03989487712, "compression/movement_sparsity/importance_threshold": -9.913434595854704e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9214929379328516, "compression/movement_sparsity/model_sparsity": 0.8283995226476938, "compression_loss": 10.665474891662598, "distillation_loss": 0.3862035870552063, "epoch": 5.31, "learning_rate": 6.169590643274853e-05, "loss": 11.0204, "step": 2120, "task_loss": 0.1747991442680359 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03990589452, "compression/movement_sparsity/importance_threshold": -8.874457407288122e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9218310679200542, "compression/movement_sparsity/model_sparsity": 0.828703493202932, "compression_loss": 10.665908813476562, "distillation_loss": 0.6665565967559814, "epoch": 5.34, "learning_rate": 6.198830409356724e-05, "loss": 11.0217, "step": 2130, "task_loss": 0.2977946698665619 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039916113919999996, "compression/movement_sparsity/importance_threshold": -7.910734252930128e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9219254954268292, "compression/movement_sparsity/model_sparsity": 0.8287883812127227, "compression_loss": 10.666594505310059, "distillation_loss": 0.5244303941726685, "epoch": 5.36, "learning_rate": 6.228070175438596e-05, "loss": 11.0024, "step": 2140, "task_loss": 0.29131343960762024 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03992556532, "compression/movement_sparsity/importance_threshold": -7.019436033747772e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9223934573547125, "compression/movement_sparsity/model_sparsity": 0.829209067494427, "compression_loss": 10.667391777038574, "distillation_loss": 0.7449897527694702, "epoch": 5.39, "learning_rate": 6.257309941520466e-05, "loss": 10.9758, "step": 2150, "task_loss": 0.40607041120529175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03993427872, "compression/movement_sparsity/importance_threshold": -6.197733650712984e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9226109417344174, "compression/movement_sparsity/model_sparsity": 0.8294045806111472, "compression_loss": 10.667990684509277, "distillation_loss": 0.7472751140594482, "epoch": 5.41, "learning_rate": 6.286549707602338e-05, "loss": 10.9848, "step": 2160, "task_loss": 0.38769587874412537 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03994228412, "compression/movement_sparsity/importance_threshold": -5.442798004793897e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9229368624849443, "compression/movement_sparsity/model_sparsity": 0.8296975753624288, "compression_loss": 10.668244361877441, "distillation_loss": 0.49155575037002563, "epoch": 5.44, "learning_rate": 6.315789473684209e-05, "loss": 11.0034, "step": 2170, "task_loss": 0.18195515871047974 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039949611520000004, "compression/movement_sparsity/importance_threshold": -4.751799996960816e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9234349711118639, "compression/movement_sparsity/model_sparsity": 0.8301453627862724, "compression_loss": 10.6676607131958, "distillation_loss": 0.2057265192270279, "epoch": 5.46, "learning_rate": 6.345029239766081e-05, "loss": 10.9875, "step": 2180, "task_loss": 0.06234532594680786 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039956290920000005, "compression/movement_sparsity/importance_threshold": -4.121910528183501e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9240599828741343, "compression/movement_sparsity/model_sparsity": 0.8307072330125114, "compression_loss": 10.666702270507812, "distillation_loss": 0.47685402631759644, "epoch": 5.49, "learning_rate": 6.374269005847953e-05, "loss": 10.9471, "step": 2190, "task_loss": 0.41978389024734497 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03996235232, "compression/movement_sparsity/importance_threshold": -3.55030049943117e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9245668543925023, "compression/movement_sparsity/model_sparsity": 0.8311628980605817, "compression_loss": 10.665879249572754, "distillation_loss": 0.6600457429885864, "epoch": 5.51, "learning_rate": 6.403508771929823e-05, "loss": 11.0211, "step": 2200, "task_loss": 0.3570174276828766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03996782572, "compression/movement_sparsity/importance_threshold": -3.0341408116741275e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9246335817524842, "compression/movement_sparsity/model_sparsity": 0.8312228843186215, "compression_loss": 10.665406227111816, "distillation_loss": 0.5320706367492676, "epoch": 5.54, "learning_rate": 6.432748538011695e-05, "loss": 10.9993, "step": 2210, "task_loss": 0.2885274887084961 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03997274112, "compression/movement_sparsity/importance_threshold": -2.5706023658810494e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.924920463527552, "compression/movement_sparsity/model_sparsity": 0.8314807839896222, "compression_loss": 10.664754867553711, "distillation_loss": 0.17137180268764496, "epoch": 5.56, "learning_rate": 6.461988304093567e-05, "loss": 10.9232, "step": 2220, "task_loss": 0.049120813608169556 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03997712852, "compression/movement_sparsity/importance_threshold": -2.156856063022781e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9257162869805782, "compression/movement_sparsity/model_sparsity": 0.8321962097313297, "compression_loss": 10.663785934448242, "distillation_loss": 0.4174395203590393, "epoch": 5.59, "learning_rate": 6.491228070175438e-05, "loss": 11.0043, "step": 2230, "task_loss": 0.13315129280090332 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998101792, "compression/movement_sparsity/importance_threshold": -1.790072804067999e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9262508115966577, "compression/movement_sparsity/model_sparsity": 0.8326767342351821, "compression_loss": 10.662391662597656, "distillation_loss": 0.4411344826221466, "epoch": 5.61, "learning_rate": 6.52046783625731e-05, "loss": 10.9919, "step": 2240, "task_loss": 0.1832524538040161 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998443932, "compression/movement_sparsity/importance_threshold": -1.4674234899864644e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9265289775293586, "compression/movement_sparsity/model_sparsity": 0.832926798577925, "compression_loss": 10.661162376403809, "distillation_loss": 1.028182864189148, "epoch": 5.64, "learning_rate": 6.54970760233918e-05, "loss": 11.0257, "step": 2250, "task_loss": 0.5844765901565552 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998742272, "compression/movement_sparsity/importance_threshold": -1.1860790217484802e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9265480441696778, "compression/movement_sparsity/model_sparsity": 0.8329439390193637, "compression_loss": 10.659642219543457, "distillation_loss": 0.3779850900173187, "epoch": 5.66, "learning_rate": 6.578947368421052e-05, "loss": 11.0075, "step": 2260, "task_loss": 0.11443006992340088 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998999812, "compression/movement_sparsity/importance_threshold": -9.432103003227232e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9268287154659741, "compression/movement_sparsity/model_sparsity": 0.833196255622456, "compression_loss": 10.657720565795898, "distillation_loss": 0.4198092222213745, "epoch": 5.69, "learning_rate": 6.608187134502923e-05, "loss": 10.9443, "step": 2270, "task_loss": 0.11462461948394775 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999219552, "compression/movement_sparsity/importance_threshold": -7.359882266794963e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9270981773185787, "compression/movement_sparsity/model_sparsity": 0.8334384952109332, "compression_loss": 10.655531883239746, "distillation_loss": 0.9664290547370911, "epoch": 5.71, "learning_rate": 6.637426900584795e-05, "loss": 10.9229, "step": 2280, "task_loss": 0.5996133685112 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999404492, "compression/movement_sparsity/importance_threshold": -5.615837017885605e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9276652128500452, "compression/movement_sparsity/model_sparsity": 0.8339482462293673, "compression_loss": 10.65322208404541, "distillation_loss": 0.7060877680778503, "epoch": 5.74, "learning_rate": 6.666666666666666e-05, "loss": 10.9933, "step": 2290, "task_loss": 0.48744356632232666 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039995576320000004, "compression/movement_sparsity/importance_threshold": -4.171676266191346e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9279948151912075, "compression/movement_sparsity/model_sparsity": 0.8342445506402235, "compression_loss": 10.65136432647705, "distillation_loss": 0.8349236249923706, "epoch": 5.76, "learning_rate": 6.695906432748538e-05, "loss": 10.975, "step": 2300, "task_loss": 0.5106658935546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999681972, "compression/movement_sparsity/importance_threshold": -2.9991090214152166e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9282819674984945, "compression/movement_sparsity/model_sparsity": 0.8345026935130461, "compression_loss": 10.649052619934082, "distillation_loss": 0.4618738889694214, "epoch": 5.79, "learning_rate": 6.725146198830408e-05, "loss": 10.9074, "step": 2310, "task_loss": 0.23405304551124573 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999780512, "compression/movement_sparsity/importance_threshold": -2.069844293243984e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9287675140206263, "compression/movement_sparsity/model_sparsity": 0.8349391879131656, "compression_loss": 10.646767616271973, "distillation_loss": 0.7326123118400574, "epoch": 5.81, "learning_rate": 6.75438596491228e-05, "loss": 10.9723, "step": 2320, "task_loss": 0.3694491982460022 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039998562519999996, "compression/movement_sparsity/importance_threshold": -1.3555910913860998e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9291809695874736, "compression/movement_sparsity/model_sparsity": 0.8353108743147789, "compression_loss": 10.644407272338867, "distillation_loss": 0.7847580909729004, "epoch": 5.84, "learning_rate": 6.783625730994151e-05, "loss": 10.9854, "step": 2330, "task_loss": 0.6249511241912842 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999912192, "compression/movement_sparsity/importance_threshold": -8.280584255229099e-09, "compression/movement_sparsity/linear_layer_sparsity": 0.9294438563309244, "compression/movement_sparsity/model_sparsity": 0.8355472030415877, "compression_loss": 10.642403602600098, "distillation_loss": 0.7005610466003418, "epoch": 5.86, "learning_rate": 6.812865497076023e-05, "loss": 10.948, "step": 2340, "task_loss": 0.4554665982723236 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039999513320000006, "compression/movement_sparsity/importance_threshold": -4.589553053628657e-09, "compression/movement_sparsity/linear_layer_sparsity": 0.9299143824337549, "compression/movement_sparsity/model_sparsity": 0.8359701944536027, "compression_loss": 10.640273094177246, "distillation_loss": 0.43438440561294556, "epoch": 5.89, "learning_rate": 6.842105263157893e-05, "loss": 10.9744, "step": 2350, "task_loss": 0.22005879878997803 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039999766719999996, "compression/movement_sparsity/importance_threshold": -2.199907406035765e-09, "compression/movement_sparsity/linear_layer_sparsity": 0.9300114329268293, "compression/movement_sparsity/model_sparsity": 0.8360574404636654, "compression_loss": 10.638136863708496, "distillation_loss": 0.37377381324768066, "epoch": 5.91, "learning_rate": 6.871345029239765e-05, "loss": 10.9263, "step": 2360, "task_loss": 0.3662358522415161 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999991212, "compression/movement_sparsity/importance_threshold": -8.287374093180963e-10, "compression/movement_sparsity/linear_layer_sparsity": 0.9300814890093345, "compression/movement_sparsity/model_sparsity": 0.836120419161512, "compression_loss": 10.635852813720703, "distillation_loss": 0.3352665603160858, "epoch": 5.94, "learning_rate": 6.900584795321636e-05, "loss": 10.9481, "step": 2370, "task_loss": 0.12698450684547424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999997952, "compression/movement_sparsity/importance_threshold": -1.9313316045174375e-10, "compression/movement_sparsity/linear_layer_sparsity": 0.9305880076595905, "compression/movement_sparsity/model_sparsity": 0.836575766989815, "compression_loss": 10.633511543273926, "distillation_loss": 0.5461183190345764, "epoch": 5.96, "learning_rate": 6.929824561403508e-05, "loss": 10.9881, "step": 2380, "task_loss": 0.2637324929237366 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999999892, "compression/movement_sparsity/importance_threshold": -1.0184756521220556e-11, "compression/movement_sparsity/linear_layer_sparsity": 0.9312243817750677, "compression/movement_sparsity/model_sparsity": 0.8371478516925683, "compression_loss": 10.631221771240234, "distillation_loss": 0.5138255953788757, "epoch": 5.99, "learning_rate": 6.95906432748538e-05, "loss": 10.9322, "step": 2390, "task_loss": 0.2347862720489502 }, { "epoch": 6.0, "eval_accuracy": 0.9619005589879376, "eval_loss": 10.780640602111816, "eval_runtime": 132.1966, "eval_samples_per_second": 51.423, "eval_steps_per_second": 1.611, "step": 2394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.7983419895172119, "epoch": 6.02, "learning_rate": 6.985380116959064e-05, "loss": 4.89, "step": 2400, "task_loss": 0.48241502046585083 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.9250924587249756, "epoch": 6.04, "learning_rate": 6.985380116959064e-05, "loss": 0.3853, "step": 2410, "task_loss": 0.5363513231277466 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 1.459159016609192, "epoch": 6.07, "learning_rate": 6.956140350877192e-05, "loss": 0.4553, "step": 2420, "task_loss": 1.2184821367263794 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5406507253646851, "epoch": 6.09, "learning_rate": 6.92690058479532e-05, "loss": 0.4053, "step": 2430, "task_loss": 0.29688769578933716 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6764876842498779, "epoch": 6.12, "learning_rate": 6.89766081871345e-05, "loss": 0.3763, "step": 2440, "task_loss": 0.7804003953933716 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3237318694591522, "epoch": 6.14, "learning_rate": 6.868421052631578e-05, "loss": 0.4146, "step": 2450, "task_loss": 0.24066579341888428 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.500560462474823, "epoch": 6.17, "learning_rate": 6.839181286549707e-05, "loss": 0.3607, "step": 2460, "task_loss": 0.3094533681869507 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5021952390670776, "epoch": 6.19, "learning_rate": 6.809941520467835e-05, "loss": 0.3754, "step": 2470, "task_loss": 0.18106400966644287 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.9087671041488647, "epoch": 6.22, "learning_rate": 6.780701754385964e-05, "loss": 0.3213, "step": 2480, "task_loss": 0.3978729248046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3499097228050232, "epoch": 6.24, "learning_rate": 6.751461988304093e-05, "loss": 0.3437, "step": 2490, "task_loss": 0.2311665415763855 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.37648361921310425, "epoch": 6.27, "learning_rate": 6.722222222222222e-05, "loss": 0.3357, "step": 2500, "task_loss": 0.22226202487945557 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.7497131824493408, "epoch": 6.29, "learning_rate": 6.69298245614035e-05, "loss": 0.3155, "step": 2510, "task_loss": 0.48474282026290894 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2494376003742218, "epoch": 6.32, "learning_rate": 6.66374269005848e-05, "loss": 0.3078, "step": 2520, "task_loss": 0.08828747272491455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.671501874923706, "epoch": 6.34, "learning_rate": 6.634502923976607e-05, "loss": 0.3398, "step": 2530, "task_loss": 0.5209366679191589 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.33928757905960083, "epoch": 6.37, "learning_rate": 6.605263157894737e-05, "loss": 0.2388, "step": 2540, "task_loss": 0.07023358345031738 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.7021492719650269, "epoch": 6.39, "learning_rate": 6.576023391812865e-05, "loss": 0.261, "step": 2550, "task_loss": 0.38910675048828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.46911415457725525, "epoch": 6.42, "learning_rate": 6.546783625730994e-05, "loss": 0.2949, "step": 2560, "task_loss": 0.2855373024940491 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4447360336780548, "epoch": 6.44, "learning_rate": 6.517543859649122e-05, "loss": 0.3047, "step": 2570, "task_loss": 0.1770896315574646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4246532618999481, "epoch": 6.47, "learning_rate": 6.488304093567252e-05, "loss": 0.2574, "step": 2580, "task_loss": 0.13081282377243042 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2699139416217804, "epoch": 6.49, "learning_rate": 6.45906432748538e-05, "loss": 0.2827, "step": 2590, "task_loss": 0.11969354748725891 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.1465551257133484, "epoch": 6.52, "learning_rate": 6.429824561403508e-05, "loss": 0.255, "step": 2600, "task_loss": 0.04637971520423889 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.38055044412612915, "epoch": 6.54, "learning_rate": 6.400584795321637e-05, "loss": 0.2831, "step": 2610, "task_loss": 0.25821614265441895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6416689157485962, "epoch": 6.57, "learning_rate": 6.371345029239765e-05, "loss": 0.2951, "step": 2620, "task_loss": 0.21043294668197632 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.9269071221351624, "epoch": 6.59, "learning_rate": 6.342105263157895e-05, "loss": 0.3323, "step": 2630, "task_loss": 0.5026609301567078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6682649850845337, "epoch": 6.62, "learning_rate": 6.312865497076023e-05, "loss": 0.301, "step": 2640, "task_loss": 0.4854172170162201 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.15332600474357605, "epoch": 6.64, "learning_rate": 6.283625730994151e-05, "loss": 0.2931, "step": 2650, "task_loss": 0.07616248726844788 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6017502546310425, "epoch": 6.67, "learning_rate": 6.25438596491228e-05, "loss": 0.3101, "step": 2660, "task_loss": 0.24977391958236694 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.49376827478408813, "epoch": 6.69, "learning_rate": 6.225146198830408e-05, "loss": 0.2862, "step": 2670, "task_loss": 0.2650887072086334 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5746809244155884, "epoch": 6.72, "learning_rate": 6.195906432748538e-05, "loss": 0.2362, "step": 2680, "task_loss": 0.3955618739128113 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.1680450141429901, "epoch": 6.74, "learning_rate": 6.166666666666666e-05, "loss": 0.2752, "step": 2690, "task_loss": 0.05644118785858154 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.267273485660553, "epoch": 6.77, "learning_rate": 6.137426900584795e-05, "loss": 0.2803, "step": 2700, "task_loss": 0.13188397884368896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.40438371896743774, "epoch": 6.79, "learning_rate": 6.108187134502923e-05, "loss": 0.3032, "step": 2710, "task_loss": 0.1812323033809662 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.35020095109939575, "epoch": 6.82, "learning_rate": 6.0789473684210525e-05, "loss": 0.2925, "step": 2720, "task_loss": 0.12603074312210083 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.17570018768310547, "epoch": 6.84, "learning_rate": 6.0497076023391806e-05, "loss": 0.2099, "step": 2730, "task_loss": 0.021570265293121338 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.9918994903564453, "epoch": 6.87, "learning_rate": 6.02046783625731e-05, "loss": 0.2642, "step": 2740, "task_loss": 0.6132215261459351 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.351323664188385, "epoch": 6.89, "learning_rate": 5.991228070175438e-05, "loss": 0.2622, "step": 2750, "task_loss": 0.13521471619606018 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2853711247444153, "epoch": 6.92, "learning_rate": 5.961988304093567e-05, "loss": 0.3161, "step": 2760, "task_loss": 0.17914897203445435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4238589107990265, "epoch": 6.94, "learning_rate": 5.9327485380116955e-05, "loss": 0.2448, "step": 2770, "task_loss": 0.25172996520996094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.557287335395813, "epoch": 6.97, "learning_rate": 5.903508771929824e-05, "loss": 0.294, "step": 2780, "task_loss": 0.25702038407325745 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5326415300369263, "epoch": 6.99, "learning_rate": 5.874269005847952e-05, "loss": 0.2389, "step": 2790, "task_loss": 0.37412598729133606 }, { "epoch": 7.0, "eval_accuracy": 0.9738158281847602, "eval_loss": 0.114750936627388, "eval_runtime": 88.7864, "eval_samples_per_second": 76.566, "eval_steps_per_second": 2.399, "step": 2793 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4145723879337311, "epoch": 7.02, "learning_rate": 5.8450292397660816e-05, "loss": 0.292, "step": 2800, "task_loss": 0.2053484320640564 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5730660557746887, "epoch": 7.04, "learning_rate": 5.81578947368421e-05, "loss": 0.2816, "step": 2810, "task_loss": 0.3073492646217346 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.22465837001800537, "epoch": 7.07, "learning_rate": 5.786549707602339e-05, "loss": 0.2321, "step": 2820, "task_loss": 0.083594411611557 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.38710612058639526, "epoch": 7.09, "learning_rate": 5.757309941520467e-05, "loss": 0.2502, "step": 2830, "task_loss": 0.22288838028907776 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4200671911239624, "epoch": 7.12, "learning_rate": 5.7280701754385965e-05, "loss": 0.2209, "step": 2840, "task_loss": 0.18464124202728271 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5526059865951538, "epoch": 7.14, "learning_rate": 5.6988304093567246e-05, "loss": 0.2381, "step": 2850, "task_loss": 0.27178797125816345 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.8992462158203125, "epoch": 7.17, "learning_rate": 5.669590643274853e-05, "loss": 0.2418, "step": 2860, "task_loss": 0.7048245668411255 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4709673821926117, "epoch": 7.19, "learning_rate": 5.640350877192982e-05, "loss": 0.2539, "step": 2870, "task_loss": 0.2595267593860626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4535144865512848, "epoch": 7.22, "learning_rate": 5.611111111111111e-05, "loss": 0.2421, "step": 2880, "task_loss": 0.19773799180984497 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5013654232025146, "epoch": 7.24, "learning_rate": 5.5818713450292395e-05, "loss": 0.249, "step": 2890, "task_loss": 0.19651299715042114 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 1.160736322402954, "epoch": 7.27, "learning_rate": 5.552631578947368e-05, "loss": 0.2468, "step": 2900, "task_loss": 0.7461255192756653 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2139417827129364, "epoch": 7.29, "learning_rate": 5.523391812865496e-05, "loss": 0.2224, "step": 2910, "task_loss": 0.07157644629478455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6396937966346741, "epoch": 7.32, "learning_rate": 5.494152046783626e-05, "loss": 0.2347, "step": 2920, "task_loss": 0.519213080406189 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.27051806449890137, "epoch": 7.34, "learning_rate": 5.464912280701754e-05, "loss": 0.2581, "step": 2930, "task_loss": 0.06110638380050659 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2813902497291565, "epoch": 7.37, "learning_rate": 5.435672514619883e-05, "loss": 0.2618, "step": 2940, "task_loss": 0.2888513207435608 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6568804979324341, "epoch": 7.39, "learning_rate": 5.406432748538011e-05, "loss": 0.2664, "step": 2950, "task_loss": 0.4726387858390808 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3869742453098297, "epoch": 7.42, "learning_rate": 5.37719298245614e-05, "loss": 0.2588, "step": 2960, "task_loss": 0.3070613741874695 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3587968349456787, "epoch": 7.44, "learning_rate": 5.3479532163742686e-05, "loss": 0.2533, "step": 2970, "task_loss": 0.12952539324760437 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.20623968541622162, "epoch": 7.47, "learning_rate": 5.3187134502923973e-05, "loss": 0.2429, "step": 2980, "task_loss": 0.050109267234802246 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3525960445404053, "epoch": 7.49, "learning_rate": 5.289473684210526e-05, "loss": 0.2101, "step": 2990, "task_loss": 0.15648704767227173 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5383965373039246, "epoch": 7.52, "learning_rate": 5.260233918128655e-05, "loss": 0.2462, "step": 3000, "task_loss": 0.2703443765640259 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.24444815516471863, "epoch": 7.54, "learning_rate": 5.230994152046783e-05, "loss": 0.2791, "step": 3010, "task_loss": 0.074734628200531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.8547993898391724, "epoch": 7.57, "learning_rate": 5.201754385964912e-05, "loss": 0.2289, "step": 3020, "task_loss": 0.37249431014060974 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.41006985306739807, "epoch": 7.59, "learning_rate": 5.17251461988304e-05, "loss": 0.2392, "step": 3030, "task_loss": 0.18442684412002563 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.15326108038425446, "epoch": 7.62, "learning_rate": 5.14327485380117e-05, "loss": 0.2821, "step": 3040, "task_loss": 0.2564823031425476 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5139175653457642, "epoch": 7.64, "learning_rate": 5.114035087719298e-05, "loss": 0.2611, "step": 3050, "task_loss": 0.3527417778968811 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.36897873878479004, "epoch": 7.67, "learning_rate": 5.0847953216374265e-05, "loss": 0.2883, "step": 3060, "task_loss": 0.4354107975959778 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.24441149830818176, "epoch": 7.69, "learning_rate": 5.055555555555555e-05, "loss": 0.2414, "step": 3070, "task_loss": 0.05061835050582886 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4979506731033325, "epoch": 7.72, "learning_rate": 5.026315789473684e-05, "loss": 0.2641, "step": 3080, "task_loss": 0.28660231828689575 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.23474541306495667, "epoch": 7.74, "learning_rate": 4.9970760233918126e-05, "loss": 0.263, "step": 3090, "task_loss": 0.08172953128814697 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3658486604690552, "epoch": 7.77, "learning_rate": 4.9678362573099414e-05, "loss": 0.2276, "step": 3100, "task_loss": 0.06395676732063293 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6114939451217651, "epoch": 7.79, "learning_rate": 4.9385964912280694e-05, "loss": 0.2165, "step": 3110, "task_loss": 0.21138450503349304 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.42987892031669617, "epoch": 7.82, "learning_rate": 4.909356725146199e-05, "loss": 0.2587, "step": 3120, "task_loss": 0.20042163133621216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.34050965309143066, "epoch": 7.84, "learning_rate": 4.880116959064327e-05, "loss": 0.2114, "step": 3130, "task_loss": 0.15165340900421143 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2943706512451172, "epoch": 7.87, "learning_rate": 4.850877192982456e-05, "loss": 0.217, "step": 3140, "task_loss": 0.12199932336807251 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5021252632141113, "epoch": 7.89, "learning_rate": 4.821637426900584e-05, "loss": 0.2315, "step": 3150, "task_loss": 0.12248951196670532 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.23276600241661072, "epoch": 7.92, "learning_rate": 4.792397660818713e-05, "loss": 0.195, "step": 3160, "task_loss": 0.04651379585266113 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2711644470691681, "epoch": 7.94, "learning_rate": 4.763157894736842e-05, "loss": 0.2542, "step": 3170, "task_loss": 0.2954481244087219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6499089598655701, "epoch": 7.97, "learning_rate": 4.7339181286549705e-05, "loss": 0.2303, "step": 3180, "task_loss": 0.2569674849510193 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.26821741461753845, "epoch": 7.99, "learning_rate": 4.704678362573099e-05, "loss": 0.2522, "step": 3190, "task_loss": 0.07824259996414185 }, { "epoch": 8.0, "eval_accuracy": 0.9746984407178582, "eval_loss": 0.10132193565368652, "eval_runtime": 90.0245, "eval_samples_per_second": 75.513, "eval_steps_per_second": 2.366, "step": 3192 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5839666128158569, "epoch": 8.02, "learning_rate": 4.675438596491228e-05, "loss": 0.2727, "step": 3200, "task_loss": 0.2630302906036377 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6127955317497253, "epoch": 8.05, "learning_rate": 4.646198830409356e-05, "loss": 0.2301, "step": 3210, "task_loss": 0.3263406455516815 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.16742965579032898, "epoch": 8.07, "learning_rate": 4.6169590643274854e-05, "loss": 0.2284, "step": 3220, "task_loss": 0.033839792013168335 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.34027671813964844, "epoch": 8.1, "learning_rate": 4.5877192982456134e-05, "loss": 0.2362, "step": 3230, "task_loss": 0.0935179591178894 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.44616079330444336, "epoch": 8.12, "learning_rate": 4.558479532163743e-05, "loss": 0.2615, "step": 3240, "task_loss": 0.1582067608833313 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3368973135948181, "epoch": 8.15, "learning_rate": 4.529239766081871e-05, "loss": 0.2173, "step": 3250, "task_loss": 0.11782985925674438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.325298547744751, "epoch": 8.17, "learning_rate": 4.4999999999999996e-05, "loss": 0.2466, "step": 3260, "task_loss": 0.2142500877380371 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5337823629379272, "epoch": 8.2, "learning_rate": 4.470760233918128e-05, "loss": 0.2397, "step": 3270, "task_loss": 0.41766586899757385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.8093789219856262, "epoch": 8.22, "learning_rate": 4.441520467836257e-05, "loss": 0.2337, "step": 3280, "task_loss": 0.43245404958724976 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3988366723060608, "epoch": 8.25, "learning_rate": 4.412280701754386e-05, "loss": 0.2029, "step": 3290, "task_loss": 0.3944489657878876 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.193750262260437, "epoch": 8.27, "learning_rate": 4.3830409356725145e-05, "loss": 0.2072, "step": 3300, "task_loss": 0.13878530263900757 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4989054203033447, "epoch": 8.3, "learning_rate": 4.3538011695906426e-05, "loss": 0.2374, "step": 3310, "task_loss": 0.35233354568481445 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.47747933864593506, "epoch": 8.32, "learning_rate": 4.324561403508772e-05, "loss": 0.194, "step": 3320, "task_loss": 0.23924636840820312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5551918745040894, "epoch": 8.35, "learning_rate": 4.2953216374269e-05, "loss": 0.177, "step": 3330, "task_loss": 0.3308478593826294 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3261679410934448, "epoch": 8.37, "learning_rate": 4.2660818713450294e-05, "loss": 0.227, "step": 3340, "task_loss": 0.18861651420593262 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.13686245679855347, "epoch": 8.4, "learning_rate": 4.2368421052631575e-05, "loss": 0.2417, "step": 3350, "task_loss": 0.10923665761947632 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2956581115722656, "epoch": 8.42, "learning_rate": 4.207602339181287e-05, "loss": 0.2169, "step": 3360, "task_loss": 0.08555316925048828 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.419700562953949, "epoch": 8.45, "learning_rate": 4.178362573099415e-05, "loss": 0.1994, "step": 3370, "task_loss": 0.14956381916999817 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.21010534465312958, "epoch": 8.47, "learning_rate": 4.1491228070175436e-05, "loss": 0.2124, "step": 3380, "task_loss": 0.10485100746154785 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.26641708612442017, "epoch": 8.5, "learning_rate": 4.1198830409356724e-05, "loss": 0.2463, "step": 3390, "task_loss": 0.08828288316726685 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4014192819595337, "epoch": 8.52, "learning_rate": 4.090643274853801e-05, "loss": 0.2073, "step": 3400, "task_loss": 0.16874057054519653 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4832579791545868, "epoch": 8.55, "learning_rate": 4.061403508771929e-05, "loss": 0.2369, "step": 3410, "task_loss": 0.3887079060077667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.30582836270332336, "epoch": 8.57, "learning_rate": 4.0321637426900585e-05, "loss": 0.2327, "step": 3420, "task_loss": 0.2814703583717346 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6271736025810242, "epoch": 8.6, "learning_rate": 4.0029239766081866e-05, "loss": 0.2156, "step": 3430, "task_loss": 0.34107378125190735 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.37824779748916626, "epoch": 8.62, "learning_rate": 3.973684210526316e-05, "loss": 0.2529, "step": 3440, "task_loss": 0.20320969820022583 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.34715747833251953, "epoch": 8.65, "learning_rate": 3.944444444444444e-05, "loss": 0.226, "step": 3450, "task_loss": 0.23775744438171387 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.48077207803726196, "epoch": 8.67, "learning_rate": 3.9152046783625734e-05, "loss": 0.2313, "step": 3460, "task_loss": 0.23516058921813965 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.48644116520881653, "epoch": 8.7, "learning_rate": 3.8859649122807015e-05, "loss": 0.1951, "step": 3470, "task_loss": 0.27591028809547424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.30401456356048584, "epoch": 8.72, "learning_rate": 3.85672514619883e-05, "loss": 0.2079, "step": 3480, "task_loss": 0.19322288036346436 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.08988785743713379, "epoch": 8.75, "learning_rate": 3.827485380116959e-05, "loss": 0.2213, "step": 3490, "task_loss": 0.25623592734336853 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.15177589654922485, "epoch": 8.77, "learning_rate": 3.7982456140350876e-05, "loss": 0.2061, "step": 3500, "task_loss": 0.15864485502243042 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.34355148673057556, "epoch": 8.8, "learning_rate": 3.769005847953216e-05, "loss": 0.2517, "step": 3510, "task_loss": 0.140297532081604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.7358225584030151, "epoch": 8.82, "learning_rate": 3.739766081871345e-05, "loss": 0.2746, "step": 3520, "task_loss": 0.26640784740448 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4418618083000183, "epoch": 8.85, "learning_rate": 3.710526315789473e-05, "loss": 0.2479, "step": 3530, "task_loss": 0.22608336806297302 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5399448275566101, "epoch": 8.87, "learning_rate": 3.6812865497076025e-05, "loss": 0.1936, "step": 3540, "task_loss": 0.28722792863845825 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.09100654721260071, "epoch": 8.9, "learning_rate": 3.6520467836257306e-05, "loss": 0.2135, "step": 3550, "task_loss": 0.15959030389785767 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11753946542739868, "epoch": 8.92, "learning_rate": 3.62280701754386e-05, "loss": 0.1941, "step": 3560, "task_loss": 0.036947041749954224 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4342065751552582, "epoch": 8.95, "learning_rate": 3.593567251461988e-05, "loss": 0.2682, "step": 3570, "task_loss": 0.2372136116027832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.38562774658203125, "epoch": 8.97, "learning_rate": 3.564327485380117e-05, "loss": 0.2453, "step": 3580, "task_loss": 0.3832007050514221 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2638232707977295, "epoch": 9.0, "learning_rate": 3.5350877192982455e-05, "loss": 0.2213, "step": 3590, "task_loss": 0.0510326623916626 }, { "epoch": 9.0, "eval_accuracy": 0.9754339511621065, "eval_loss": 0.09834744036197662, "eval_runtime": 88.3371, "eval_samples_per_second": 76.955, "eval_steps_per_second": 2.411, "step": 3591 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.6044875383377075, "epoch": 9.02, "learning_rate": 3.505847953216374e-05, "loss": 0.2061, "step": 3600, "task_loss": 0.29902324080467224 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.48518362641334534, "epoch": 9.05, "learning_rate": 3.476608187134503e-05, "loss": 0.1969, "step": 3610, "task_loss": 0.1398446261882782 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5704216957092285, "epoch": 9.07, "learning_rate": 3.447368421052631e-05, "loss": 0.1819, "step": 3620, "task_loss": 0.2688668966293335 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2637028098106384, "epoch": 9.1, "learning_rate": 3.41812865497076e-05, "loss": 0.23, "step": 3630, "task_loss": 0.28141531348228455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.34086185693740845, "epoch": 9.12, "learning_rate": 3.3888888888888884e-05, "loss": 0.2153, "step": 3640, "task_loss": 0.3492887616157532 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.22228632867336273, "epoch": 9.15, "learning_rate": 3.359649122807017e-05, "loss": 0.2178, "step": 3650, "task_loss": 0.24493113160133362 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.31930631399154663, "epoch": 9.17, "learning_rate": 3.330409356725146e-05, "loss": 0.2436, "step": 3660, "task_loss": 0.21123933792114258 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2261628806591034, "epoch": 9.2, "learning_rate": 3.3011695906432746e-05, "loss": 0.2135, "step": 3670, "task_loss": 0.05479854345321655 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.20525804162025452, "epoch": 9.22, "learning_rate": 3.2719298245614033e-05, "loss": 0.1785, "step": 3680, "task_loss": 0.15875384211540222 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.17571699619293213, "epoch": 9.25, "learning_rate": 3.242690058479532e-05, "loss": 0.2115, "step": 3690, "task_loss": 0.05111098289489746 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.07689700275659561, "epoch": 9.27, "learning_rate": 3.213450292397661e-05, "loss": 0.2062, "step": 3700, "task_loss": 0.21406108140945435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.05007719248533249, "epoch": 9.3, "learning_rate": 3.1842105263157895e-05, "loss": 0.2535, "step": 3710, "task_loss": 0.1095457673072815 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3887738585472107, "epoch": 9.32, "learning_rate": 3.1549707602339176e-05, "loss": 0.2004, "step": 3720, "task_loss": 0.20141488313674927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2081771194934845, "epoch": 9.35, "learning_rate": 3.125730994152046e-05, "loss": 0.1698, "step": 3730, "task_loss": 0.06815099716186523 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.13659320771694183, "epoch": 9.37, "learning_rate": 3.096491228070175e-05, "loss": 0.1826, "step": 3740, "task_loss": 0.049779534339904785 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.46494409441947937, "epoch": 9.4, "learning_rate": 3.067251461988304e-05, "loss": 0.2115, "step": 3750, "task_loss": 0.26856356859207153 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11897766590118408, "epoch": 9.42, "learning_rate": 3.0380116959064325e-05, "loss": 0.1778, "step": 3760, "task_loss": 0.02882954478263855 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.31966108083724976, "epoch": 9.45, "learning_rate": 3.0087719298245612e-05, "loss": 0.2114, "step": 3770, "task_loss": 0.30511146783828735 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.38032066822052, "epoch": 9.47, "learning_rate": 2.97953216374269e-05, "loss": 0.197, "step": 3780, "task_loss": 0.19982492923736572 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2971644699573517, "epoch": 9.5, "learning_rate": 2.9502923976608186e-05, "loss": 0.1968, "step": 3790, "task_loss": 0.1379544734954834 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.30849239230155945, "epoch": 9.52, "learning_rate": 2.921052631578947e-05, "loss": 0.17, "step": 3800, "task_loss": 0.27317845821380615 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.1406727433204651, "epoch": 9.55, "learning_rate": 2.8918128654970757e-05, "loss": 0.1798, "step": 3810, "task_loss": 0.06477174162864685 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.39964333176612854, "epoch": 9.57, "learning_rate": 2.8625730994152045e-05, "loss": 0.1924, "step": 3820, "task_loss": 0.2196149230003357 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.24219584465026855, "epoch": 9.6, "learning_rate": 2.8333333333333332e-05, "loss": 0.1992, "step": 3830, "task_loss": 0.17496830224990845 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.662438690662384, "epoch": 9.62, "learning_rate": 2.804093567251462e-05, "loss": 0.176, "step": 3840, "task_loss": 0.4713020324707031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5137396454811096, "epoch": 9.65, "learning_rate": 2.7748538011695903e-05, "loss": 0.1907, "step": 3850, "task_loss": 0.2300821840763092 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.23515579104423523, "epoch": 9.67, "learning_rate": 2.745614035087719e-05, "loss": 0.1941, "step": 3860, "task_loss": 0.13494980335235596 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.33846205472946167, "epoch": 9.7, "learning_rate": 2.7163742690058478e-05, "loss": 0.1983, "step": 3870, "task_loss": 0.23276156187057495 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2716241478919983, "epoch": 9.72, "learning_rate": 2.6871345029239765e-05, "loss": 0.1765, "step": 3880, "task_loss": 0.3164524734020233 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.38798290491104126, "epoch": 9.75, "learning_rate": 2.6578947368421052e-05, "loss": 0.2094, "step": 3890, "task_loss": 0.1473989486694336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.15159770846366882, "epoch": 9.77, "learning_rate": 2.6286549707602336e-05, "loss": 0.2493, "step": 3900, "task_loss": 0.05544543266296387 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.34458696842193604, "epoch": 9.8, "learning_rate": 2.5994152046783623e-05, "loss": 0.1892, "step": 3910, "task_loss": 0.24102401733398438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.20027554035186768, "epoch": 9.82, "learning_rate": 2.570175438596491e-05, "loss": 0.1815, "step": 3920, "task_loss": 0.18737459182739258 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.25017499923706055, "epoch": 9.85, "learning_rate": 2.5409356725146198e-05, "loss": 0.1988, "step": 3930, "task_loss": 0.13370424509048462 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.26910173892974854, "epoch": 9.87, "learning_rate": 2.5116959064327485e-05, "loss": 0.1978, "step": 3940, "task_loss": 0.10323816537857056 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.7278082370758057, "epoch": 9.9, "learning_rate": 2.482456140350877e-05, "loss": 0.1861, "step": 3950, "task_loss": 0.40188902616500854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4854397177696228, "epoch": 9.92, "learning_rate": 2.4532163742690056e-05, "loss": 0.217, "step": 3960, "task_loss": 0.2801436185836792 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.359090119600296, "epoch": 9.95, "learning_rate": 2.4239766081871343e-05, "loss": 0.1845, "step": 3970, "task_loss": 0.23310816287994385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3712664842605591, "epoch": 9.97, "learning_rate": 2.394736842105263e-05, "loss": 0.2111, "step": 3980, "task_loss": 0.3080166280269623 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3844277858734131, "epoch": 10.0, "learning_rate": 2.3654970760233918e-05, "loss": 0.2053, "step": 3990, "task_loss": 0.18040329217910767 }, { "epoch": 10.0, "eval_accuracy": 0.9767578699617535, "eval_loss": 0.09344575554132462, "eval_runtime": 109.4349, "eval_samples_per_second": 62.119, "eval_steps_per_second": 1.946, "step": 3990 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3194921910762787, "epoch": 10.03, "learning_rate": 2.33625730994152e-05, "loss": 0.2147, "step": 4000, "task_loss": 0.08249035477638245 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.7327070236206055, "epoch": 10.05, "learning_rate": 2.307017543859649e-05, "loss": 0.2009, "step": 4010, "task_loss": 0.40034812688827515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2106877863407135, "epoch": 10.08, "learning_rate": 2.2777777777777776e-05, "loss": 0.2072, "step": 4020, "task_loss": 0.25867098569869995 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.27913522720336914, "epoch": 10.1, "learning_rate": 2.2485380116959063e-05, "loss": 0.2391, "step": 4030, "task_loss": 0.2785712480545044 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.14740639925003052, "epoch": 10.13, "learning_rate": 2.219298245614035e-05, "loss": 0.1665, "step": 4040, "task_loss": 0.03724539279937744 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.03694353997707367, "epoch": 10.15, "learning_rate": 2.1900584795321638e-05, "loss": 0.1724, "step": 4050, "task_loss": 0.03579223155975342 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.48803913593292236, "epoch": 10.18, "learning_rate": 2.1608187134502922e-05, "loss": 0.1803, "step": 4060, "task_loss": 0.305889368057251 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.29847007989883423, "epoch": 10.2, "learning_rate": 2.131578947368421e-05, "loss": 0.1663, "step": 4070, "task_loss": 0.15580615401268005 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3054282069206238, "epoch": 10.23, "learning_rate": 2.1023391812865496e-05, "loss": 0.1386, "step": 4080, "task_loss": 0.20085352659225464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3656499683856964, "epoch": 10.25, "learning_rate": 2.0730994152046784e-05, "loss": 0.2038, "step": 4090, "task_loss": 0.06400293111801147 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.27491533756256104, "epoch": 10.28, "learning_rate": 2.043859649122807e-05, "loss": 0.2023, "step": 4100, "task_loss": 0.0929376482963562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.1880778968334198, "epoch": 10.3, "learning_rate": 2.0146198830409355e-05, "loss": 0.1673, "step": 4110, "task_loss": 0.06569665670394897 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.36783143877983093, "epoch": 10.33, "learning_rate": 1.9853801169590642e-05, "loss": 0.1728, "step": 4120, "task_loss": 0.22449851036071777 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4092685282230377, "epoch": 10.35, "learning_rate": 1.956140350877193e-05, "loss": 0.1937, "step": 4130, "task_loss": 0.14843645691871643 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.1534067690372467, "epoch": 10.38, "learning_rate": 1.9269005847953216e-05, "loss": 0.1773, "step": 4140, "task_loss": 0.0629468560218811 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3426051735877991, "epoch": 10.4, "learning_rate": 1.8976608187134504e-05, "loss": 0.1967, "step": 4150, "task_loss": 0.18895089626312256 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2175091803073883, "epoch": 10.43, "learning_rate": 1.8684210526315787e-05, "loss": 0.1879, "step": 4160, "task_loss": 0.04404401779174805 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2926103472709656, "epoch": 10.45, "learning_rate": 1.8391812865497075e-05, "loss": 0.1696, "step": 4170, "task_loss": 0.06125068664550781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11740519106388092, "epoch": 10.48, "learning_rate": 1.8099415204678362e-05, "loss": 0.1897, "step": 4180, "task_loss": 0.18562465906143188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.06711611151695251, "epoch": 10.5, "learning_rate": 1.780701754385965e-05, "loss": 0.14, "step": 4190, "task_loss": 0.010339558124542236 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.31951457262039185, "epoch": 10.53, "learning_rate": 1.7514619883040936e-05, "loss": 0.1742, "step": 4200, "task_loss": 0.17789161205291748 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.24925366044044495, "epoch": 10.55, "learning_rate": 1.722222222222222e-05, "loss": 0.1216, "step": 4210, "task_loss": 0.11909815669059753 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.32841724157333374, "epoch": 10.58, "learning_rate": 1.6929824561403508e-05, "loss": 0.1491, "step": 4220, "task_loss": 0.07813969254493713 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.21618331968784332, "epoch": 10.6, "learning_rate": 1.663742690058479e-05, "loss": 0.1802, "step": 4230, "task_loss": 0.04656791687011719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5704886317253113, "epoch": 10.63, "learning_rate": 1.634502923976608e-05, "loss": 0.1595, "step": 4240, "task_loss": 0.33919835090637207 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.03302011638879776, "epoch": 10.65, "learning_rate": 1.6052631578947366e-05, "loss": 0.1652, "step": 4250, "task_loss": 0.03273957967758179 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2786880135536194, "epoch": 10.68, "learning_rate": 1.5760233918128653e-05, "loss": 0.1777, "step": 4260, "task_loss": 0.08011233806610107 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.28394752740859985, "epoch": 10.7, "learning_rate": 1.546783625730994e-05, "loss": 0.1657, "step": 4270, "task_loss": 0.11078321933746338 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.19141581654548645, "epoch": 10.73, "learning_rate": 1.5175438596491226e-05, "loss": 0.1342, "step": 4280, "task_loss": 0.11384612321853638 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.19070206582546234, "epoch": 10.75, "learning_rate": 1.4883040935672513e-05, "loss": 0.1618, "step": 4290, "task_loss": 0.03678613901138306 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3137202858924866, "epoch": 10.78, "learning_rate": 1.4590643274853799e-05, "loss": 0.1616, "step": 4300, "task_loss": 0.40499967336654663 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.31758958101272583, "epoch": 10.8, "learning_rate": 1.4298245614035086e-05, "loss": 0.1941, "step": 4310, "task_loss": 0.08149957656860352 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.08811835944652557, "epoch": 10.83, "learning_rate": 1.4005847953216372e-05, "loss": 0.2147, "step": 4320, "task_loss": 0.07098215818405151 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.1679392009973526, "epoch": 10.85, "learning_rate": 1.3713450292397659e-05, "loss": 0.164, "step": 4330, "task_loss": 0.15692710876464844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3241733908653259, "epoch": 10.88, "learning_rate": 1.3421052631578946e-05, "loss": 0.1727, "step": 4340, "task_loss": 0.18366020917892456 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5590642690658569, "epoch": 10.9, "learning_rate": 1.3128654970760232e-05, "loss": 0.1559, "step": 4350, "task_loss": 0.4202096462249756 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.09962806850671768, "epoch": 10.93, "learning_rate": 1.2836257309941519e-05, "loss": 0.1921, "step": 4360, "task_loss": 0.055821776390075684 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.12555056810379028, "epoch": 10.95, "learning_rate": 1.2543859649122804e-05, "loss": 0.1507, "step": 4370, "task_loss": 0.17544305324554443 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11806661635637283, "epoch": 10.98, "learning_rate": 1.2251461988304092e-05, "loss": 0.1543, "step": 4380, "task_loss": 0.05767279863357544 }, { "epoch": 11.0, "eval_accuracy": 0.9779346866725508, "eval_loss": 0.08748478442430496, "eval_runtime": 88.9404, "eval_samples_per_second": 76.433, "eval_steps_per_second": 2.395, "step": 4389 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.19886839389801025, "epoch": 11.0, "learning_rate": 1.1959064327485379e-05, "loss": 0.1981, "step": 4390, "task_loss": 0.26288312673568726 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11590077728033066, "epoch": 11.03, "learning_rate": 1.1666666666666665e-05, "loss": 0.1666, "step": 4400, "task_loss": 0.06784418225288391 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.21343311667442322, "epoch": 11.05, "learning_rate": 1.1374269005847952e-05, "loss": 0.1697, "step": 4410, "task_loss": 0.0758419930934906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.30859267711639404, "epoch": 11.08, "learning_rate": 1.1081871345029239e-05, "loss": 0.1896, "step": 4420, "task_loss": 0.07110640406608582 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2189975082874298, "epoch": 11.1, "learning_rate": 1.0789473684210525e-05, "loss": 0.1926, "step": 4430, "task_loss": 0.05106937885284424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.39309024810791016, "epoch": 11.13, "learning_rate": 1.0497076023391812e-05, "loss": 0.1556, "step": 4440, "task_loss": 0.23538663983345032 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5418274998664856, "epoch": 11.15, "learning_rate": 1.0204678362573097e-05, "loss": 0.1725, "step": 4450, "task_loss": 0.2049427330493927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5003339648246765, "epoch": 11.18, "learning_rate": 9.912280701754385e-06, "loss": 0.1857, "step": 4460, "task_loss": 0.20348548889160156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.20201507210731506, "epoch": 11.2, "learning_rate": 9.619883040935672e-06, "loss": 0.1821, "step": 4470, "task_loss": 0.06827902793884277 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.4064372181892395, "epoch": 11.23, "learning_rate": 9.327485380116957e-06, "loss": 0.1734, "step": 4480, "task_loss": 0.2290695309638977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.25272685289382935, "epoch": 11.25, "learning_rate": 9.035087719298245e-06, "loss": 0.1609, "step": 4490, "task_loss": 0.09427431225776672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.35564708709716797, "epoch": 11.28, "learning_rate": 8.74269005847953e-06, "loss": 0.1769, "step": 4500, "task_loss": 0.15388613939285278 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.13362084329128265, "epoch": 11.3, "learning_rate": 8.450292397660817e-06, "loss": 0.1779, "step": 4510, "task_loss": 0.07664147019386292 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.05402388423681259, "epoch": 11.33, "learning_rate": 8.157894736842105e-06, "loss": 0.1608, "step": 4520, "task_loss": 0.013635486364364624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.47015678882598877, "epoch": 11.35, "learning_rate": 7.86549707602339e-06, "loss": 0.1832, "step": 4530, "task_loss": 0.17718610167503357 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.36746612191200256, "epoch": 11.38, "learning_rate": 7.5730994152046775e-06, "loss": 0.1559, "step": 4540, "task_loss": 0.26919397711753845 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3066258430480957, "epoch": 11.4, "learning_rate": 7.280701754385964e-06, "loss": 0.1626, "step": 4550, "task_loss": 0.08805280923843384 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.21340705454349518, "epoch": 11.43, "learning_rate": 6.98830409356725e-06, "loss": 0.1768, "step": 4560, "task_loss": 0.16585972905158997 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.5809781551361084, "epoch": 11.45, "learning_rate": 6.695906432748537e-06, "loss": 0.1898, "step": 4570, "task_loss": 0.31455641984939575 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.31768062710762024, "epoch": 11.48, "learning_rate": 6.403508771929824e-06, "loss": 0.1649, "step": 4580, "task_loss": 0.2114783525466919 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.23975571990013123, "epoch": 11.5, "learning_rate": 6.11111111111111e-06, "loss": 0.1643, "step": 4590, "task_loss": 0.034987449645996094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.40438854694366455, "epoch": 11.53, "learning_rate": 5.818713450292397e-06, "loss": 0.1531, "step": 4600, "task_loss": 0.2620016932487488 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11603149771690369, "epoch": 11.55, "learning_rate": 5.526315789473683e-06, "loss": 0.1272, "step": 4610, "task_loss": 0.04970061779022217 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.04775264859199524, "epoch": 11.58, "learning_rate": 5.23391812865497e-06, "loss": 0.1616, "step": 4620, "task_loss": 0.02132624387741089 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.11175966262817383, "epoch": 11.6, "learning_rate": 4.941520467836257e-06, "loss": 0.1758, "step": 4630, "task_loss": 0.09407076239585876 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.03183834254741669, "epoch": 11.63, "learning_rate": 4.649122807017543e-06, "loss": 0.1642, "step": 4640, "task_loss": 0.011670589447021484 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.03778667002916336, "epoch": 11.65, "learning_rate": 4.35672514619883e-06, "loss": 0.1842, "step": 4650, "task_loss": 0.11324834823608398 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.06027041748166084, "epoch": 11.68, "learning_rate": 4.064327485380116e-06, "loss": 0.1553, "step": 4660, "task_loss": 0.03356653451919556 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.10275101661682129, "epoch": 11.7, "learning_rate": 3.771929824561403e-06, "loss": 0.169, "step": 4670, "task_loss": 0.04669731855392456 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.10769091546535492, "epoch": 11.73, "learning_rate": 3.4795321637426897e-06, "loss": 0.1337, "step": 4680, "task_loss": 0.041588425636291504 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.18004803359508514, "epoch": 11.75, "learning_rate": 3.187134502923976e-06, "loss": 0.1598, "step": 4690, "task_loss": 0.06004643440246582 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.06837992370128632, "epoch": 11.78, "learning_rate": 2.894736842105263e-06, "loss": 0.1706, "step": 4700, "task_loss": 0.10142296552658081 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.3725603222846985, "epoch": 11.8, "learning_rate": 2.6023391812865493e-06, "loss": 0.1747, "step": 4710, "task_loss": 0.1912306547164917 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.24147792160511017, "epoch": 11.83, "learning_rate": 2.3099415204678357e-06, "loss": 0.1578, "step": 4720, "task_loss": 0.1674221158027649 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.44135892391204834, "epoch": 11.85, "learning_rate": 2.0175438596491226e-06, "loss": 0.1313, "step": 4730, "task_loss": 0.21962109208106995 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2678522765636444, "epoch": 11.88, "learning_rate": 1.7251461988304092e-06, "loss": 0.1521, "step": 4740, "task_loss": 0.0885564386844635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.45461341738700867, "epoch": 11.9, "learning_rate": 1.4327485380116958e-06, "loss": 0.1516, "step": 4750, "task_loss": 0.17013055086135864 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.05051502212882042, "epoch": 11.93, "learning_rate": 1.1403508771929824e-06, "loss": 0.1645, "step": 4760, "task_loss": 0.129602313041687 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.2683265805244446, "epoch": 11.95, "learning_rate": 8.479532163742689e-07, "loss": 0.1474, "step": 4770, "task_loss": 0.14865410327911377 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, "compression/movement_sparsity/model_sparsity": 0.5730886610217056, "compression_loss": 0.0, "distillation_loss": 0.37407612800598145, "epoch": 11.98, "learning_rate": 5.555555555555555e-07, "loss": 0.1836, "step": 4780, "task_loss": 0.14674681425094604 }, { "epoch": 12.0, "eval_accuracy": 0.9794057075610474, "eval_loss": 0.08688130974769592, "eval_runtime": 89.0371, "eval_samples_per_second": 76.35, "eval_steps_per_second": 2.392, "step": 4788 }, { "epoch": 12.0, "step": 4788, "total_flos": 5.579752612756608e+18, "train_loss": 3.67681538981503, "train_runtime": 15601.2245, "train_samples_per_second": 39.3, "train_steps_per_second": 0.307 } ], "max_steps": 4788, "num_train_epochs": 12, "total_flos": 5.579752612756608e+18, "trial_name": null, "trial_params": null }