{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.999373825923607, "global_step": 4788, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.995430946350098, "epoch": 0.03, "learning_rate": 2.0467836257309939e-07, "loss": 3.0412, "step": 10, "task_loss": 2.487548828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.383060455322266, "epoch": 0.05, "learning_rate": 4.6783625730994144e-07, "loss": 3.0355, "step": 20, "task_loss": 2.48077392578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.193294525146484, "epoch": 0.08, "learning_rate": 7.602339181286548e-07, "loss": 3.0144, "step": 30, "task_loss": 2.46014404296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.485038757324219, "epoch": 0.1, "learning_rate": 1.0526315789473683e-06, "loss": 2.9724, "step": 40, "task_loss": 2.44512939453125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.449984550476074, "epoch": 0.13, "learning_rate": 1.3450292397660817e-06, "loss": 2.9464, "step": 50, "task_loss": 2.33709716796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.332710266113281, "epoch": 0.15, "learning_rate": 1.637426900584795e-06, "loss": 2.8964, "step": 60, "task_loss": 2.27166748046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.976663589477539, "epoch": 0.18, "learning_rate": 1.9298245614035085e-06, "loss": 2.8193, "step": 70, "task_loss": 2.18878173828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.810410976409912, "epoch": 0.2, "learning_rate": 2.222222222222222e-06, "loss": 2.7893, "step": 80, "task_loss": 2.187286376953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.629610061645508, "epoch": 0.23, "learning_rate": 2.5146198830409352e-06, "loss": 2.6867, "step": 90, "task_loss": 2.094635009765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.557668685913086, "epoch": 0.25, "learning_rate": 2.807017543859649e-06, "loss": 2.6471, "step": 100, "task_loss": 2.004180908203125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.144706726074219, "epoch": 0.28, "learning_rate": 3.099415204678362e-06, "loss": 2.6164, "step": 110, "task_loss": 1.87591552734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.510150909423828, "epoch": 0.3, "learning_rate": 3.3918128654970756e-06, "loss": 2.5618, "step": 120, "task_loss": 1.8662109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.989267349243164, "epoch": 0.33, "learning_rate": 3.684210526315789e-06, "loss": 2.5334, "step": 130, "task_loss": 1.986572265625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.915393829345703, "epoch": 0.35, "learning_rate": 3.976608187134502e-06, "loss": 2.4574, "step": 140, "task_loss": 1.77105712890625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.035706520080566, "epoch": 0.38, "learning_rate": 4.269005847953216e-06, "loss": 2.3992, "step": 150, "task_loss": 1.81634521484375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.917450428009033, "epoch": 0.4, "learning_rate": 4.56140350877193e-06, "loss": 2.4227, "step": 160, "task_loss": 1.72393798828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.47239875793457, "epoch": 0.43, "learning_rate": 4.853801169590642e-06, "loss": 2.4069, "step": 170, "task_loss": 1.76953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.833039283752441, "epoch": 0.45, "learning_rate": 5.146198830409356e-06, "loss": 2.3883, "step": 180, "task_loss": 1.574005126953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.729918479919434, "epoch": 0.48, "learning_rate": 5.4385964912280695e-06, "loss": 2.3209, "step": 190, "task_loss": 1.912689208984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.127784729003906, "epoch": 0.5, "learning_rate": 5.730994152046783e-06, "loss": 2.3481, "step": 200, "task_loss": 1.5352325439453125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.22283935546875, "epoch": 0.53, "learning_rate": 6.023391812865496e-06, "loss": 2.3197, "step": 210, "task_loss": 1.6494598388671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.147849082946777, "epoch": 0.55, "learning_rate": 6.3157894736842095e-06, "loss": 2.2953, "step": 220, "task_loss": 1.684326171875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.035430908203125, "epoch": 0.58, "learning_rate": 6.608187134502923e-06, "loss": 2.3397, "step": 230, "task_loss": 1.548065185546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.975306510925293, "epoch": 0.6, "learning_rate": 6.900584795321637e-06, "loss": 2.346, "step": 240, "task_loss": 1.9248809814453125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.835908889770508, "epoch": 0.63, "learning_rate": 7.1929824561403494e-06, "loss": 2.2726, "step": 250, "task_loss": 1.675628662109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.860301971435547, "epoch": 0.65, "learning_rate": 7.485380116959063e-06, "loss": 2.2569, "step": 260, "task_loss": 1.73553466796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.720399856567383, "epoch": 0.68, "learning_rate": 7.777777777777777e-06, "loss": 2.2592, "step": 270, "task_loss": 1.951629638671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.569833278656006, "epoch": 0.7, "learning_rate": 8.07017543859649e-06, "loss": 2.1852, "step": 280, "task_loss": 1.224029541015625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.938345909118652, "epoch": 0.73, "learning_rate": 8.362573099415203e-06, "loss": 2.2498, "step": 290, "task_loss": 1.57440185546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.931737899780273, "epoch": 0.75, "learning_rate": 8.654970760233917e-06, "loss": 2.3263, "step": 300, "task_loss": 1.5814971923828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 7.799921035766602, "epoch": 0.78, "learning_rate": 8.94736842105263e-06, "loss": 2.2039, "step": 310, "task_loss": 1.4974365234375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.04701042175293, "epoch": 0.8, "learning_rate": 9.239766081871343e-06, "loss": 2.2692, "step": 320, "task_loss": 1.415435791015625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.033458709716797, "epoch": 0.83, "learning_rate": 9.532163742690057e-06, "loss": 2.2481, "step": 330, "task_loss": 1.709228515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.039294242858887, "epoch": 0.85, "learning_rate": 9.82456140350877e-06, "loss": 2.2423, "step": 340, "task_loss": 1.4802398681640625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.168000221252441, "epoch": 0.88, "learning_rate": 1.0116959064327485e-05, "loss": 2.1574, "step": 350, "task_loss": 1.9547576904296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.144676208496094, "epoch": 0.9, "learning_rate": 1.0409356725146197e-05, "loss": 2.3151, "step": 360, "task_loss": 1.858917236328125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.119178771972656, "epoch": 0.93, "learning_rate": 1.070175438596491e-05, "loss": 2.2298, "step": 370, "task_loss": 1.600616455078125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.317169189453125, "epoch": 0.95, "learning_rate": 1.0994152046783625e-05, "loss": 2.2426, "step": 380, "task_loss": 1.474639892578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, "distillation_loss": 8.469932556152344, "epoch": 0.98, "learning_rate": 1.1286549707602337e-05, "loss": 2.2245, "step": 390, "task_loss": 1.885498046875 }, { "epoch": 1.0, "eval_accuracy": 0.6209179170344219, "eval_loss": 2.2351226806640625, "eval_runtime": 32.365, "eval_samples_per_second": 210.042, "eval_steps_per_second": 3.306, "step": 399 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -0.0006927956128492951, "compression/movement_sparsity/linear_layer_sparsity": 0.001006521002710027, "compression/movement_sparsity/model_sparsity": 0.0009048376648987656, "compression_loss": 0.0, "distillation_loss": 8.044456481933594, "epoch": 1.0, "learning_rate": 1.1549707602339179e-05, "loss": 2.2384, "step": 400, "task_loss": 1.415863037109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0005970049999999993, "compression/movement_sparsity/importance_threshold": -0.0006824555517280679, "compression/movement_sparsity/linear_layer_sparsity": 0.0012294983626919603, "compression/movement_sparsity/model_sparsity": 0.0011052888360001294, "compression_loss": 0.16241011023521423, "distillation_loss": 8.000807762145996, "epoch": 1.03, "learning_rate": 1.1812865497076021e-05, "loss": 2.297, "step": 410, "task_loss": 1.533203125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0011880399999999991, "compression/movement_sparsity/importance_threshold": -0.0006722188903520583, "compression/movement_sparsity/linear_layer_sparsity": 0.0018773054049984945, "compression/movement_sparsity/model_sparsity": 0.0016876514592215047, "compression_loss": 0.32319411635398865, "distillation_loss": 7.843788146972656, "epoch": 1.05, "learning_rate": 1.2105263157894735e-05, "loss": 2.3983, "step": 420, "task_loss": 1.4488372802734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0017731350000000036, "compression/movement_sparsity/importance_threshold": -0.0006620851091245567, "compression/movement_sparsity/linear_layer_sparsity": 0.003179306402439024, "compression/movement_sparsity/model_sparsity": 0.0028581183834565237, "compression_loss": 0.4823570251464844, "distillation_loss": 7.915602684020996, "epoch": 1.08, "learning_rate": 1.2368421052631577e-05, "loss": 2.6136, "step": 430, "task_loss": 1.73846435546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0023523200000000032, "compression/movement_sparsity/importance_threshold": -0.0006520536884488537, "compression/movement_sparsity/linear_layer_sparsity": 0.004699579851701295, "compression/movement_sparsity/model_sparsity": 0.0042248068818925245, "compression_loss": 0.6399109959602356, "distillation_loss": 7.839539527893066, "epoch": 1.1, "learning_rate": 1.2631578947368419e-05, "loss": 2.7967, "step": 440, "task_loss": 1.6051025390625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002925625000000003, "compression/movement_sparsity/importance_threshold": -0.0006421241087282396, "compression/movement_sparsity/linear_layer_sparsity": 0.007616611148750376, "compression/movement_sparsity/model_sparsity": 0.006847146386137242, "compression_loss": 0.7958579063415527, "distillation_loss": 7.503289222717285, "epoch": 1.13, "learning_rate": 1.2923976608187133e-05, "loss": 2.9338, "step": 450, "task_loss": 1.3809051513671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0034930800000000017, "compression/movement_sparsity/importance_threshold": -0.0006322958503660047, "compression/movement_sparsity/linear_layer_sparsity": 0.014220867208672086, "compression/movement_sparsity/model_sparsity": 0.012784210407219245, "compression_loss": 0.9502034187316895, "distillation_loss": 8.325506210327148, "epoch": 1.15, "learning_rate": 1.3216374269005846e-05, "loss": 3.0766, "step": 460, "task_loss": 1.928070068359375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004054715000000005, "compression/movement_sparsity/importance_threshold": -0.0006225683937654393, "compression/movement_sparsity/linear_layer_sparsity": 0.024098351400180667, "compression/movement_sparsity/model_sparsity": 0.021663826139881652, "compression_loss": 1.1029574871063232, "distillation_loss": 7.289924621582031, "epoch": 1.18, "learning_rate": 1.3508771929824559e-05, "loss": 3.1735, "step": 470, "task_loss": 1.4115447998046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004610560000000006, "compression/movement_sparsity/importance_threshold": -0.0006129412193298339, "compression/movement_sparsity/linear_layer_sparsity": 0.03601247271153267, "compression/movement_sparsity/model_sparsity": 0.032374328630797006, "compression_loss": 1.2541292905807495, "distillation_loss": 8.556415557861328, "epoch": 1.2, "learning_rate": 1.3801169590643273e-05, "loss": 3.3443, "step": 480, "task_loss": 1.9100341796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005160645000000006, "compression/movement_sparsity/importance_threshold": -0.0006034138074624787, "compression/movement_sparsity/linear_layer_sparsity": 0.050367029885576634, "compression/movement_sparsity/model_sparsity": 0.045278723033940614, "compression_loss": 1.4037203788757324, "distillation_loss": 7.887024879455566, "epoch": 1.23, "learning_rate": 1.4093567251461986e-05, "loss": 3.4895, "step": 490, "task_loss": 1.471527099609375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005705000000000004, "compression/movement_sparsity/importance_threshold": -0.0005939856385666644, "compression/movement_sparsity/linear_layer_sparsity": 0.06657988463565191, "compression/movement_sparsity/model_sparsity": 0.05985368132482818, "compression_loss": 1.551735520362854, "distillation_loss": 7.81777286529541, "epoch": 1.25, "learning_rate": 1.4385964912280699e-05, "loss": 3.5889, "step": 500, "task_loss": 1.650634765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006243655000000006, "compression/movement_sparsity/importance_threshold": -0.0005846561930456809, "compression/movement_sparsity/linear_layer_sparsity": 0.08038760209650708, "compression/movement_sparsity/model_sparsity": 0.07226648025423262, "compression_loss": 1.6981920003890991, "distillation_loss": 7.416506290435791, "epoch": 1.28, "learning_rate": 1.4678362573099413e-05, "loss": 3.8023, "step": 510, "task_loss": 1.49237060546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006776640000000005, "compression/movement_sparsity/importance_threshold": -0.0005754249513028189, "compression/movement_sparsity/linear_layer_sparsity": 0.09304496951219512, "compression/movement_sparsity/model_sparsity": 0.0836451427414936, "compression_loss": 1.843092679977417, "distillation_loss": 7.790732383728027, "epoch": 1.3, "learning_rate": 1.4970760233918126e-05, "loss": 3.907, "step": 520, "task_loss": 1.5792388916015625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007303984999999993, "compression/movement_sparsity/importance_threshold": -0.0005662913937413687, "compression/movement_sparsity/linear_layer_sparsity": 0.10428879432023487, "compression/movement_sparsity/model_sparsity": 0.09375306513600375, "compression_loss": 1.9864468574523926, "distillation_loss": 8.140114784240723, "epoch": 1.33, "learning_rate": 1.526315789473684e-05, "loss": 4.0017, "step": 530, "task_loss": 1.455169677734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007825719999999996, "compression/movement_sparsity/importance_threshold": -0.0005572550007646206, "compression/movement_sparsity/linear_layer_sparsity": 0.11561735452423968, "compression/movement_sparsity/model_sparsity": 0.10393716257068979, "compression_loss": 2.1282644271850586, "distillation_loss": 8.059106826782227, "epoch": 1.35, "learning_rate": 1.5555555555555555e-05, "loss": 4.1032, "step": 540, "task_loss": 1.761474609375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008341874999999997, "compression/movement_sparsity/importance_threshold": -0.0005483152527758649, "compression/movement_sparsity/linear_layer_sparsity": 0.1275118681308341, "compression/movement_sparsity/model_sparsity": 0.11463003821652279, "compression_loss": 2.2685375213623047, "distillation_loss": 8.375931739807129, "epoch": 1.38, "learning_rate": 1.5847953216374268e-05, "loss": 4.2944, "step": 550, "task_loss": 1.9170379638671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008852479999999998, "compression/movement_sparsity/importance_threshold": -0.000539471630178392, "compression/movement_sparsity/linear_layer_sparsity": 0.13925447201520627, "compression/movement_sparsity/model_sparsity": 0.12518635075243473, "compression_loss": 2.4072768688201904, "distillation_loss": 7.257189750671387, "epoch": 1.4, "learning_rate": 1.614035087719298e-05, "loss": 4.4048, "step": 560, "task_loss": 1.4711456298828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009357564999999997, "compression/movement_sparsity/importance_threshold": -0.0005307236133754923, "compression/movement_sparsity/linear_layer_sparsity": 0.15153005965823546, "compression/movement_sparsity/model_sparsity": 0.13622180259921418, "compression_loss": 2.5445077419281006, "distillation_loss": 7.987864971160889, "epoch": 1.43, "learning_rate": 1.6432748538011693e-05, "loss": 4.4927, "step": 570, "task_loss": 1.3629608154296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009857159999999997, "compression/movement_sparsity/importance_threshold": -0.0005220706827704563, "compression/movement_sparsity/linear_layer_sparsity": 0.1634156574638663, "compression/movement_sparsity/model_sparsity": 0.14690666315892098, "compression_loss": 2.680231809616089, "distillation_loss": 7.393160820007324, "epoch": 1.45, "learning_rate": 1.6725146198830406e-05, "loss": 4.6605, "step": 580, "task_loss": 1.32977294921875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010351294999999996, "compression/movement_sparsity/importance_threshold": -0.0005135123187665742, "compression/movement_sparsity/linear_layer_sparsity": 0.17530592489084612, "compression/movement_sparsity/model_sparsity": 0.1575957215935514, "compression_loss": 2.814448595046997, "distillation_loss": 7.518686294555664, "epoch": 1.48, "learning_rate": 1.701754385964912e-05, "loss": 4.789, "step": 590, "task_loss": 1.121612548828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010839999999999997, "compression/movement_sparsity/importance_threshold": -0.0005050480017671363, "compression/movement_sparsity/linear_layer_sparsity": 0.18872400538241493, "compression/movement_sparsity/model_sparsity": 0.16965824645565067, "compression_loss": 2.947171688079834, "distillation_loss": 7.525865077972412, "epoch": 1.5, "learning_rate": 1.7309941520467835e-05, "loss": 4.9268, "step": 600, "task_loss": 1.5207061767578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011323304999999997, "compression/movement_sparsity/importance_threshold": -0.000496677212175433, "compression/movement_sparsity/linear_layer_sparsity": 0.20174494457618186, "compression/movement_sparsity/model_sparsity": 0.1813637510433887, "compression_loss": 3.078418731689453, "distillation_loss": 7.749105453491211, "epoch": 1.53, "learning_rate": 1.7602339181286548e-05, "loss": 5.0, "step": 610, "task_loss": 1.4150848388671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011801239999999998, "compression/movement_sparsity/importance_threshold": -0.0004883994303947548, "compression/movement_sparsity/linear_layer_sparsity": 0.21374602435260465, "compression/movement_sparsity/model_sparsity": 0.19215242705901536, "compression_loss": 3.2082204818725586, "distillation_loss": 7.886735916137695, "epoch": 1.55, "learning_rate": 1.789473684210526e-05, "loss": 5.1341, "step": 620, "task_loss": 1.52386474609375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012273834999999997, "compression/movement_sparsity/importance_threshold": -0.000480214136828392, "compression/movement_sparsity/linear_layer_sparsity": 0.22315820488934057, "compression/movement_sparsity/model_sparsity": 0.20061374623222222, "compression_loss": 3.336570978164673, "distillation_loss": 7.682252883911133, "epoch": 1.58, "learning_rate": 1.8187134502923973e-05, "loss": 5.2778, "step": 630, "task_loss": 1.2125701904296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012741120000000002, "compression/movement_sparsity/importance_threshold": -0.00047212081187963484, "compression/movement_sparsity/linear_layer_sparsity": 0.2345525867585065, "compression/movement_sparsity/model_sparsity": 0.2108570157275446, "compression_loss": 3.463460922241211, "distillation_loss": 7.583769798278809, "epoch": 1.6, "learning_rate": 1.8479532163742686e-05, "loss": 5.4233, "step": 640, "task_loss": 1.331329345703125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013203125, "compression/movement_sparsity/importance_threshold": -0.0004641189359517739, "compression/movement_sparsity/linear_layer_sparsity": 0.24699830388060826, "compression/movement_sparsity/model_sparsity": 0.2220454097982419, "compression_loss": 3.588883638381958, "distillation_loss": 7.881678104400635, "epoch": 1.63, "learning_rate": 1.8771929824561402e-05, "loss": 5.5528, "step": 650, "task_loss": 1.7361907958984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013659880000000003, "compression/movement_sparsity/importance_threshold": -0.00045620798944809933, "compression/movement_sparsity/linear_layer_sparsity": 0.26060269873532066, "compression/movement_sparsity/model_sparsity": 0.2342754266975963, "compression_loss": 3.7128448486328125, "distillation_loss": 7.501363754272461, "epoch": 1.65, "learning_rate": 1.9064327485380115e-05, "loss": 5.6663, "step": 660, "task_loss": 1.44635009765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014111415000000002, "compression/movement_sparsity/importance_threshold": -0.0004483874527719017, "compression/movement_sparsity/linear_layer_sparsity": 0.27815656993375487, "compression/movement_sparsity/model_sparsity": 0.2500559258450153, "compression_loss": 3.8353116512298584, "distillation_loss": 7.70191764831543, "epoch": 1.68, "learning_rate": 1.9356725146198827e-05, "loss": 5.7849, "step": 670, "task_loss": 1.49664306640625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014557760000000003, "compression/movement_sparsity/importance_threshold": -0.00044065680632647124, "compression/movement_sparsity/linear_layer_sparsity": 0.2952470312029509, "compression/movement_sparsity/model_sparsity": 0.2654198308457312, "compression_loss": 3.956355094909668, "distillation_loss": 7.210139274597168, "epoch": 1.7, "learning_rate": 1.964912280701754e-05, "loss": 5.8788, "step": 680, "task_loss": 1.430419921875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014998945000000003, "compression/movement_sparsity/importance_threshold": -0.0004330155305150983, "compression/movement_sparsity/linear_layer_sparsity": 0.31303866493526045, "compression/movement_sparsity/model_sparsity": 0.281414072672511, "compression_loss": 4.075984477996826, "distillation_loss": 6.941951751708984, "epoch": 1.73, "learning_rate": 1.9941520467836253e-05, "loss": 5.9629, "step": 690, "task_loss": 1.329498291015625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015435000000000004, "compression/movement_sparsity/importance_threshold": -0.00042546310574107334, "compression/movement_sparsity/linear_layer_sparsity": 0.3283816997892201, "compression/movement_sparsity/model_sparsity": 0.2952070842364404, "compression_loss": 4.194240093231201, "distillation_loss": 7.564118385314941, "epoch": 1.75, "learning_rate": 2.023391812865497e-05, "loss": 6.0479, "step": 700, "task_loss": 1.326568603515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015865955, "compression/movement_sparsity/importance_threshold": -0.00041799901240768665, "compression/movement_sparsity/linear_layer_sparsity": 0.34366781701671184, "compression/movement_sparsity/model_sparsity": 0.3089489282518674, "compression_loss": 4.311094284057617, "distillation_loss": 7.411632537841797, "epoch": 1.78, "learning_rate": 2.0526315789473682e-05, "loss": 6.1386, "step": 710, "task_loss": 1.1554412841796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016291840000000005, "compression/movement_sparsity/importance_threshold": -0.00041062273091822853, "compression/movement_sparsity/linear_layer_sparsity": 0.35876836090409514, "compression/movement_sparsity/model_sparsity": 0.32252394639155113, "compression_loss": 4.426570415496826, "distillation_loss": 7.248075008392334, "epoch": 1.8, "learning_rate": 2.0818713450292395e-05, "loss": 6.2603, "step": 720, "task_loss": 1.391021728515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016712685000000005, "compression/movement_sparsity/importance_threshold": -0.0004033337416759895, "compression/movement_sparsity/linear_layer_sparsity": 0.37388741860508884, "compression/movement_sparsity/model_sparsity": 0.3361156079950378, "compression_loss": 4.54065465927124, "distillation_loss": 7.62760591506958, "epoch": 1.83, "learning_rate": 2.1111111111111107e-05, "loss": 6.455, "step": 730, "task_loss": 1.4605560302734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017128519999999994, "compression/movement_sparsity/importance_threshold": -0.00039613152508426, "compression/movement_sparsity/linear_layer_sparsity": 0.38791397310674497, "compression/movement_sparsity/model_sparsity": 0.3487251360502713, "compression_loss": 4.65338134765625, "distillation_loss": 7.30867338180542, "epoch": 1.85, "learning_rate": 2.140350877192982e-05, "loss": 6.5288, "step": 740, "task_loss": 1.2646942138671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017539375000000003, "compression/movement_sparsity/importance_threshold": -0.00038901556154633, "compression/movement_sparsity/linear_layer_sparsity": 0.40139734596130683, "compression/movement_sparsity/model_sparsity": 0.3608463571433556, "compression_loss": 4.764727592468262, "distillation_loss": 7.188628196716309, "epoch": 1.88, "learning_rate": 2.1695906432748536e-05, "loss": 6.5733, "step": 750, "task_loss": 1.35302734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017945279999999997, "compression/movement_sparsity/importance_threshold": -0.0003819853314654902, "compression/movement_sparsity/linear_layer_sparsity": 0.4167137157482686, "compression/movement_sparsity/model_sparsity": 0.37461539746685213, "compression_loss": 4.874687671661377, "distillation_loss": 7.548184394836426, "epoch": 1.9, "learning_rate": 2.198830409356725e-05, "loss": 6.7677, "step": 760, "task_loss": 1.2998046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018346265000000007, "compression/movement_sparsity/importance_threshold": -0.0003750403152450307, "compression/movement_sparsity/linear_layer_sparsity": 0.4299143824337549, "compression/movement_sparsity/model_sparsity": 0.38648247265618474, "compression_loss": 4.983335971832275, "distillation_loss": 7.388566970825195, "epoch": 1.93, "learning_rate": 2.2280701754385962e-05, "loss": 6.8162, "step": 770, "task_loss": 1.3633880615234375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01874236, "compression/movement_sparsity/importance_threshold": -0.0003681799932882423, "compression/movement_sparsity/linear_layer_sparsity": 0.4441794052055104, "compression/movement_sparsity/model_sparsity": 0.3993063778303141, "compression_loss": 5.090628623962402, "distillation_loss": 7.243496894836426, "epoch": 1.95, "learning_rate": 2.2573099415204675e-05, "loss": 6.895, "step": 780, "task_loss": 1.0947265625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019133595000000003, "compression/movement_sparsity/importance_threshold": -0.00036140384599841487, "compression/movement_sparsity/linear_layer_sparsity": 0.4571137152777778, "compression/movement_sparsity/model_sparsity": 0.4109340049651238, "compression_loss": 5.196588039398193, "distillation_loss": 6.702712059020996, "epoch": 1.98, "learning_rate": 2.2865497076023387e-05, "loss": 6.9856, "step": 790, "task_loss": 1.150970458984375 }, { "epoch": 2.0, "eval_accuracy": 0.7353633421594586, "eval_loss": 7.059737682342529, "eval_runtime": 32.65, "eval_samples_per_second": 208.208, "eval_steps_per_second": 3.277, "step": 798 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019558376005000003, "compression/movement_sparsity/importance_threshold": -0.000354046685581272, "compression/movement_sparsity/linear_layer_sparsity": 0.47266684780563084, "compression/movement_sparsity/model_sparsity": 0.42491588917863976, "compression_loss": 5.311591625213623, "distillation_loss": 6.95596981048584, "epoch": 2.01, "learning_rate": 2.3157894736842103e-05, "loss": 7.3249, "step": 800, "task_loss": 1.1584014892578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019939502655, "compression/movement_sparsity/importance_threshold": -0.00034744561380477333, "compression/movement_sparsity/linear_layer_sparsity": 0.49021440266486, "compression/movement_sparsity/model_sparsity": 0.44069071009222, "compression_loss": 5.4146504402160645, "distillation_loss": 7.146793365478516, "epoch": 2.03, "learning_rate": 2.3450292397660816e-05, "loss": 7.2369, "step": 810, "task_loss": 1.382080078125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020315862305000005, "compression/movement_sparsity/importance_threshold": -0.0003409271059454359, "compression/movement_sparsity/linear_layer_sparsity": 0.5090454682324601, "compression/movement_sparsity/model_sparsity": 0.4576193756142168, "compression_loss": 5.5163493156433105, "distillation_loss": 7.342703342437744, "epoch": 2.06, "learning_rate": 2.374269005847953e-05, "loss": 7.3456, "step": 820, "task_loss": 1.3241424560546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020687484955000005, "compression/movement_sparsity/importance_threshold": -0.0003344906424065501, "compression/movement_sparsity/linear_layer_sparsity": 0.5285819877295995, "compression/movement_sparsity/model_sparsity": 0.47518222689545686, "compression_loss": 5.616704940795898, "distillation_loss": 7.162722587585449, "epoch": 2.08, "learning_rate": 2.403508771929824e-05, "loss": 7.3943, "step": 830, "task_loss": 1.150177001953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021054400605000002, "compression/movement_sparsity/importance_threshold": -0.00032813570359140645, "compression/movement_sparsity/linear_layer_sparsity": 0.546536564193767, "compression/movement_sparsity/model_sparsity": 0.49132295023688916, "compression_loss": 5.715770244598389, "distillation_loss": 6.800263404846191, "epoch": 2.11, "learning_rate": 2.4327485380116954e-05, "loss": 7.5439, "step": 840, "task_loss": 1.088836669921875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021416639255000005, "compression/movement_sparsity/importance_threshold": -0.00032186176990329514, "compression/movement_sparsity/linear_layer_sparsity": 0.5614409251731406, "compression/movement_sparsity/model_sparsity": 0.5047216047598192, "compression_loss": 5.813632011413574, "distillation_loss": 7.243762016296387, "epoch": 2.13, "learning_rate": 2.461988304093567e-05, "loss": 7.6294, "step": 850, "task_loss": 1.623992919921875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021774230905000004, "compression/movement_sparsity/importance_threshold": -0.0003156683217455066, "compression/movement_sparsity/linear_layer_sparsity": 0.5762013982987052, "compression/movement_sparsity/model_sparsity": 0.5179909076355432, "compression_loss": 5.910200595855713, "distillation_loss": 6.999300479888916, "epoch": 2.16, "learning_rate": 2.4912280701754383e-05, "loss": 7.6402, "step": 860, "task_loss": 1.3790283203125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022127205555, "compression/movement_sparsity/importance_threshold": -0.0003095548395213313, "compression/movement_sparsity/linear_layer_sparsity": 0.5922373955510388, "compression/movement_sparsity/model_sparsity": 0.5324068753789455, "compression_loss": 6.005468845367432, "distillation_loss": 6.536607265472412, "epoch": 2.18, "learning_rate": 2.5204678362573096e-05, "loss": 7.7517, "step": 870, "task_loss": 1.020782470703125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022475593205000005, "compression/movement_sparsity/importance_threshold": -0.0003035208036340593, "compression/movement_sparsity/linear_layer_sparsity": 0.6046185142841012, "compression/movement_sparsity/model_sparsity": 0.5435371970842005, "compression_loss": 6.09952449798584, "distillation_loss": 6.741325378417969, "epoch": 2.21, "learning_rate": 2.549707602339181e-05, "loss": 7.7904, "step": 880, "task_loss": 1.1522979736328125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022819423855000003, "compression/movement_sparsity/importance_threshold": -0.00029756569448698134, "compression/movement_sparsity/linear_layer_sparsity": 0.6168116954607046, "compression/movement_sparsity/model_sparsity": 0.5544985675412698, "compression_loss": 6.1923980712890625, "distillation_loss": 7.237061977386475, "epoch": 2.23, "learning_rate": 2.578947368421052e-05, "loss": 7.9343, "step": 890, "task_loss": 1.05108642578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023158727504999997, "compression/movement_sparsity/importance_threshold": -0.00029168899248338765, "compression/movement_sparsity/linear_layer_sparsity": 0.6273384569783198, "compression/movement_sparsity/model_sparsity": 0.5639618676461849, "compression_loss": 6.284049987792969, "distillation_loss": 7.010084629058838, "epoch": 2.26, "learning_rate": 2.6081871345029238e-05, "loss": 8.0004, "step": 900, "task_loss": 1.2064743041992188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023493534155000006, "compression/movement_sparsity/importance_threshold": -0.0002858901780265683, "compression/movement_sparsity/linear_layer_sparsity": 0.6380538500263475, "compression/movement_sparsity/model_sparsity": 0.5735947428648286, "compression_loss": 6.374482154846191, "distillation_loss": 7.233295440673828, "epoch": 2.28, "learning_rate": 2.637426900584795e-05, "loss": 8.1421, "step": 910, "task_loss": 1.3159103393554688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023823873805, "compression/movement_sparsity/importance_threshold": -0.00028016873151981403, "compression/movement_sparsity/linear_layer_sparsity": 0.6493039794113219, "compression/movement_sparsity/model_sparsity": 0.5837083329191853, "compression_loss": 6.463593482971191, "distillation_loss": 6.789806365966797, "epoch": 2.31, "learning_rate": 2.6666666666666663e-05, "loss": 8.1579, "step": 920, "task_loss": 1.0103607177734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024149776455000004, "compression/movement_sparsity/importance_threshold": -0.000274524133366415, "compression/movement_sparsity/linear_layer_sparsity": 0.6626772691772057, "compression/movement_sparsity/model_sparsity": 0.595730592018793, "compression_loss": 6.551351070404053, "distillation_loss": 7.41424560546875, "epoch": 2.33, "learning_rate": 2.6959064327485376e-05, "loss": 8.2611, "step": 930, "task_loss": 1.22052001953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024471272104999997, "compression/movement_sparsity/importance_threshold": -0.0002689558639696618, "compression/movement_sparsity/linear_layer_sparsity": 0.6734886894007829, "compression/movement_sparsity/model_sparsity": 0.6054497933101735, "compression_loss": 6.637966632843018, "distillation_loss": 6.994537353515625, "epoch": 2.36, "learning_rate": 2.725146198830409e-05, "loss": 8.3167, "step": 940, "task_loss": 0.9749984741210938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024788390755, "compression/movement_sparsity/importance_threshold": -0.0002634634037328445, "compression/movement_sparsity/linear_layer_sparsity": 0.6839432776272207, "compression/movement_sparsity/model_sparsity": 0.6148482113986368, "compression_loss": 6.723362445831299, "distillation_loss": 6.864375591278076, "epoch": 2.38, "learning_rate": 2.75438596491228e-05, "loss": 8.4735, "step": 950, "task_loss": 1.2460098266601562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025101162405, "compression/movement_sparsity/importance_threshold": -0.0002580462330592536, "compression/movement_sparsity/linear_layer_sparsity": 0.6961206503124059, "compression/movement_sparsity/model_sparsity": 0.6257953704101208, "compression_loss": 6.807560920715332, "distillation_loss": 6.893831253051758, "epoch": 2.41, "learning_rate": 2.7836257309941518e-05, "loss": 8.546, "step": 960, "task_loss": 1.1099472045898438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025409617055, "compression/movement_sparsity/importance_threshold": -0.0002527038323521795, "compression/movement_sparsity/linear_layer_sparsity": 0.7067540368112014, "compression/movement_sparsity/model_sparsity": 0.6353545237547908, "compression_loss": 6.8906145095825195, "distillation_loss": 6.39096736907959, "epoch": 2.43, "learning_rate": 2.812865497076023e-05, "loss": 8.6131, "step": 970, "task_loss": 1.100616455078125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025713784705, "compression/movement_sparsity/importance_threshold": -0.0002474356820149125, "compression/movement_sparsity/linear_layer_sparsity": 0.7156849287676904, "compression/movement_sparsity/model_sparsity": 0.6433831763130731, "compression_loss": 6.972564220428467, "distillation_loss": 6.60116720199585, "epoch": 2.46, "learning_rate": 2.8421052631578943e-05, "loss": 8.6269, "step": 980, "task_loss": 1.0101547241210938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026013695355, "compression/movement_sparsity/importance_threshold": -0.00024224126245074297, "compression/movement_sparsity/linear_layer_sparsity": 0.7266012213941584, "compression/movement_sparsity/model_sparsity": 0.6531966553193631, "compression_loss": 7.053177833557129, "distillation_loss": 6.616091728210449, "epoch": 2.48, "learning_rate": 2.8713450292397656e-05, "loss": 8.7434, "step": 990, "task_loss": 1.01708984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026309379005, "compression/movement_sparsity/importance_threshold": -0.00023712005406296125, "compression/movement_sparsity/linear_layer_sparsity": 0.7362348854825354, "compression/movement_sparsity/model_sparsity": 0.6618570827666554, "compression_loss": 7.132675647735596, "distillation_loss": 6.928311347961426, "epoch": 2.51, "learning_rate": 2.900584795321637e-05, "loss": 8.7891, "step": 1000, "task_loss": 1.1827316284179688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026600865655, "compression/movement_sparsity/importance_threshold": -0.00023207153725485788, "compression/movement_sparsity/linear_layer_sparsity": 0.7443839746123155, "compression/movement_sparsity/model_sparsity": 0.6691829137819935, "compression_loss": 7.211142063140869, "distillation_loss": 6.722048759460449, "epoch": 2.53, "learning_rate": 2.9298245614035085e-05, "loss": 8.8333, "step": 1010, "task_loss": 1.0985260009765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026888185305, "compression/movement_sparsity/importance_threshold": -0.000227095192429723, "compression/movement_sparsity/linear_layer_sparsity": 0.7514503702762723, "compression/movement_sparsity/model_sparsity": 0.6755354299586155, "compression_loss": 7.288491249084473, "distillation_loss": 6.20843505859375, "epoch": 2.56, "learning_rate": 2.9590643274853797e-05, "loss": 8.9245, "step": 1020, "task_loss": 0.8749008178710938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027171367955, "compression/movement_sparsity/importance_threshold": -0.000222190499990847, "compression/movement_sparsity/linear_layer_sparsity": 0.7583029749134297, "compression/movement_sparsity/model_sparsity": 0.6816957532520842, "compression_loss": 7.364686965942383, "distillation_loss": 7.18936014175415, "epoch": 2.58, "learning_rate": 2.988304093567251e-05, "loss": 8.9874, "step": 1030, "task_loss": 1.1425094604492188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027450443605000003, "compression/movement_sparsity/importance_threshold": -0.0002173569403415204, "compression/movement_sparsity/linear_layer_sparsity": 0.7656279993789521, "compression/movement_sparsity/model_sparsity": 0.6882807703703202, "compression_loss": 7.439706802368164, "distillation_loss": 7.14259672164917, "epoch": 2.61, "learning_rate": 3.0175438596491223e-05, "loss": 9.1133, "step": 1040, "task_loss": 1.4981765747070312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027725442255, "compression/movement_sparsity/importance_threshold": -0.00021259399388503343, "compression/movement_sparsity/linear_layer_sparsity": 0.7736612066207468, "compression/movement_sparsity/model_sparsity": 0.6955024264140018, "compression_loss": 7.513519763946533, "distillation_loss": 6.862635612487793, "epoch": 2.63, "learning_rate": 3.0467836257309936e-05, "loss": 9.1331, "step": 1050, "task_loss": 0.7846755981445312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027996393905000003, "compression/movement_sparsity/importance_threshold": -0.00020790114102467645, "compression/movement_sparsity/linear_layer_sparsity": 0.7802128970942487, "compression/movement_sparsity/model_sparsity": 0.7013922352637143, "compression_loss": 7.586307525634766, "distillation_loss": 6.549140930175781, "epoch": 2.66, "learning_rate": 3.076023391812865e-05, "loss": 9.1899, "step": 1060, "task_loss": 1.2988433837890625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028263328555, "compression/movement_sparsity/importance_threshold": -0.0002032778621637399, "compression/movement_sparsity/linear_layer_sparsity": 0.7866864625677507, "compression/movement_sparsity/model_sparsity": 0.707211811656896, "compression_loss": 7.65805196762085, "distillation_loss": 6.931570529937744, "epoch": 2.68, "learning_rate": 3.1052631578947365e-05, "loss": 9.266, "step": 1070, "task_loss": 0.9027557373046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028526276205, "compression/movement_sparsity/importance_threshold": -0.0001987236377055142, "compression/movement_sparsity/linear_layer_sparsity": 0.7926764458182777, "compression/movement_sparsity/model_sparsity": 0.7125966595066641, "compression_loss": 7.728639125823975, "distillation_loss": 6.96481990814209, "epoch": 2.71, "learning_rate": 3.134502923976608e-05, "loss": 9.2969, "step": 1080, "task_loss": 1.0967559814453125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028785266855, "compression/movement_sparsity/importance_threshold": -0.0001942379480532894, "compression/movement_sparsity/linear_layer_sparsity": 0.7986948937631737, "compression/movement_sparsity/model_sparsity": 0.7180070964176795, "compression_loss": 7.7981181144714355, "distillation_loss": 6.923816204071045, "epoch": 2.73, "learning_rate": 3.163742690058479e-05, "loss": 9.3504, "step": 1090, "task_loss": 0.900115966796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029040330505000003, "compression/movement_sparsity/importance_threshold": -0.0001898202736103562, "compression/movement_sparsity/linear_layer_sparsity": 0.8043253985057212, "compression/movement_sparsity/model_sparsity": 0.7230687819162739, "compression_loss": 7.866626739501953, "distillation_loss": 6.877065658569336, "epoch": 2.76, "learning_rate": 3.19298245614035e-05, "loss": 9.4694, "step": 1100, "task_loss": 1.219451904296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029291497154999995, "compression/movement_sparsity/importance_threshold": -0.000185470094780005, "compression/movement_sparsity/linear_layer_sparsity": 0.8090906823998796, "compression/movement_sparsity/model_sparsity": 0.7273526551188803, "compression_loss": 7.934044361114502, "distillation_loss": 7.070435047149658, "epoch": 2.78, "learning_rate": 3.2222222222222216e-05, "loss": 9.5107, "step": 1110, "task_loss": 1.032562255859375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029538796805, "compression/movement_sparsity/importance_threshold": -0.0001811868919655257, "compression/movement_sparsity/linear_layer_sparsity": 0.8151365364348088, "compression/movement_sparsity/model_sparsity": 0.7327877294318403, "compression_loss": 8.000240325927734, "distillation_loss": 6.940860748291016, "epoch": 2.81, "learning_rate": 3.251461988304093e-05, "loss": 9.5373, "step": 1120, "task_loss": 0.9963302612304688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029782259455, "compression/movement_sparsity/importance_threshold": -0.00017697014557020913, "compression/movement_sparsity/linear_layer_sparsity": 0.8208416022094249, "compression/movement_sparsity/model_sparsity": 0.7379164434673136, "compression_loss": 8.065471649169922, "distillation_loss": 6.221469879150391, "epoch": 2.83, "learning_rate": 3.280701754385964e-05, "loss": 9.6332, "step": 1130, "task_loss": 0.7717514038085938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030021915105000005, "compression/movement_sparsity/importance_threshold": -0.00017281933599734545, "compression/movement_sparsity/linear_layer_sparsity": 0.8245439767765733, "compression/movement_sparsity/model_sparsity": 0.74124478728617, "compression_loss": 8.129778861999512, "distillation_loss": 6.495223045349121, "epoch": 2.86, "learning_rate": 3.309941520467836e-05, "loss": 9.667, "step": 1140, "task_loss": 0.9031600952148438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030257793755, "compression/movement_sparsity/importance_threshold": -0.00016873394365022514, "compression/movement_sparsity/linear_layer_sparsity": 0.8293295388249021, "compression/movement_sparsity/model_sparsity": 0.745546890051417, "compression_loss": 8.193017959594727, "distillation_loss": 5.940375804901123, "epoch": 2.88, "learning_rate": 3.3391812865497073e-05, "loss": 9.6598, "step": 1150, "task_loss": 1.1773223876953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030489925404999996, "compression/movement_sparsity/importance_threshold": -0.00016471344893213846, "compression/movement_sparsity/linear_layer_sparsity": 0.8339565689927733, "compression/movement_sparsity/model_sparsity": 0.7497064765491057, "compression_loss": 8.255036354064941, "distillation_loss": 6.499246597290039, "epoch": 2.91, "learning_rate": 3.3684210526315786e-05, "loss": 9.7623, "step": 1160, "task_loss": 0.774017333984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030718340054999994, "compression/movement_sparsity/importance_threshold": -0.0001607573322463758, "compression/movement_sparsity/linear_layer_sparsity": 0.8392098341990364, "compression/movement_sparsity/model_sparsity": 0.7544290329682275, "compression_loss": 8.315759658813477, "distillation_loss": 6.641131401062012, "epoch": 2.93, "learning_rate": 3.39766081871345e-05, "loss": 9.9, "step": 1170, "task_loss": 0.9453201293945312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030943067705000004, "compression/movement_sparsity/importance_threshold": -0.0001568650739962274, "compression/movement_sparsity/linear_layer_sparsity": 0.8446625169376694, "compression/movement_sparsity/model_sparsity": 0.7593308608519719, "compression_loss": 8.375600814819336, "distillation_loss": 6.510258674621582, "epoch": 2.96, "learning_rate": 3.426900584795321e-05, "loss": 9.8962, "step": 1180, "task_loss": 0.84234619140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031164138355, "compression/movement_sparsity/importance_threshold": -0.00015303615458498387, "compression/movement_sparsity/linear_layer_sparsity": 0.8489380198923517, "compression/movement_sparsity/model_sparsity": 0.7631744330172485, "compression_loss": 8.434557914733887, "distillation_loss": 7.006261825561523, "epoch": 2.98, "learning_rate": 3.4561403508771924e-05, "loss": 10.013, "step": 1190, "task_loss": 1.2601242065429688 }, { "epoch": 3.0, "eval_accuracy": 0.8068549573403943, "eval_loss": 9.877946853637695, "eval_runtime": 32.5023, "eval_samples_per_second": 209.154, "eval_steps_per_second": 3.292, "step": 1197 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03140312804, "compression/movement_sparsity/importance_threshold": -0.00014889687945287796, "compression/movement_sparsity/linear_layer_sparsity": 0.8525809597071665, "compression/movement_sparsity/model_sparsity": 0.7664493464532609, "compression_loss": 8.498291015625, "distillation_loss": 6.154516220092773, "epoch": 3.01, "learning_rate": 3.485380116959064e-05, "loss": 10.2572, "step": 1200, "task_loss": 0.9739151000976562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03161661664, "compression/movement_sparsity/importance_threshold": -0.00014519928031604462, "compression/movement_sparsity/linear_layer_sparsity": 0.8557199803334838, "compression/movement_sparsity/model_sparsity": 0.7692712489132579, "compression_loss": 8.555134773254395, "distillation_loss": 6.301018714904785, "epoch": 3.03, "learning_rate": 3.514619883040936e-05, "loss": 10.0509, "step": 1210, "task_loss": 1.0142364501953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03182654124, "compression/movement_sparsity/importance_threshold": -0.000141563409268316, "compression/movement_sparsity/linear_layer_sparsity": 0.8595924255683529, "compression/movement_sparsity/model_sparsity": 0.7727524820860711, "compression_loss": 8.611062049865723, "distillation_loss": 6.52682638168335, "epoch": 3.06, "learning_rate": 3.543859649122806e-05, "loss": 10.0881, "step": 1220, "task_loss": 0.840057373046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03203293184, "compression/movement_sparsity/importance_threshold": -0.0001379887467129826, "compression/movement_sparsity/linear_layer_sparsity": 0.862141885915387, "compression/movement_sparsity/model_sparsity": 0.7750443843324734, "compression_loss": 8.665979385375977, "distillation_loss": 6.351391792297363, "epoch": 3.08, "learning_rate": 3.573099415204678e-05, "loss": 10.1386, "step": 1230, "task_loss": 1.0308151245117188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03223581844, "compression/movement_sparsity/importance_threshold": -0.00013447477305333487, "compression/movement_sparsity/linear_layer_sparsity": 0.8653379417908762, "compression/movement_sparsity/model_sparsity": 0.7779175600808953, "compression_loss": 8.719856262207031, "distillation_loss": 6.51836633682251, "epoch": 3.11, "learning_rate": 3.6023391812865495e-05, "loss": 10.2182, "step": 1240, "task_loss": 0.9089126586914062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03243523104, "compression/movement_sparsity/importance_threshold": -0.00013102096869266311, "compression/movement_sparsity/linear_layer_sparsity": 0.8683791003274616, "compression/movement_sparsity/model_sparsity": 0.7806514869253643, "compression_loss": 8.7727632522583, "distillation_loss": 5.789917945861816, "epoch": 3.13, "learning_rate": 3.631578947368421e-05, "loss": 10.2812, "step": 1250, "task_loss": 1.0487136840820312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03263119964, "compression/movement_sparsity/importance_threshold": -0.00012762681403425769, "compression/movement_sparsity/linear_layer_sparsity": 0.8715998452085215, "compression/movement_sparsity/model_sparsity": 0.7835468574835209, "compression_loss": 8.824639320373535, "distillation_loss": 6.552893161773682, "epoch": 3.16, "learning_rate": 3.660818713450292e-05, "loss": 10.2635, "step": 1260, "task_loss": 0.8233566284179688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03282375424, "compression/movement_sparsity/importance_threshold": -0.00012429178948140886, "compression/movement_sparsity/linear_layer_sparsity": 0.8750408621273713, "compression/movement_sparsity/model_sparsity": 0.7866402471945613, "compression_loss": 8.875518798828125, "distillation_loss": 6.6541595458984375, "epoch": 3.18, "learning_rate": 3.690058479532163e-05, "loss": 10.3797, "step": 1270, "task_loss": 1.2135162353515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03301292484, "compression/movement_sparsity/importance_threshold": -0.00012101537543740711, "compression/movement_sparsity/linear_layer_sparsity": 0.8768676485809997, "compression/movement_sparsity/model_sparsity": 0.7882824833570649, "compression_loss": 8.925738334655762, "distillation_loss": 6.410771369934082, "epoch": 3.21, "learning_rate": 3.7192982456140346e-05, "loss": 10.3435, "step": 1280, "task_loss": 0.742523193359375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03319874144, "compression/movement_sparsity/importance_threshold": -0.00011779705230554292, "compression/movement_sparsity/linear_layer_sparsity": 0.8788982516561277, "compression/movement_sparsity/model_sparsity": 0.7901079456572931, "compression_loss": 8.975088119506836, "distillation_loss": 6.352423191070557, "epoch": 3.23, "learning_rate": 3.7485380116959065e-05, "loss": 10.408, "step": 1290, "task_loss": 1.1050643920898438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03338123404, "compression/movement_sparsity/importance_threshold": -0.00011463630048910635, "compression/movement_sparsity/linear_layer_sparsity": 0.8814144247779283, "compression/movement_sparsity/model_sparsity": 0.7923699235056253, "compression_loss": 9.023488998413086, "distillation_loss": 6.480988025665283, "epoch": 3.26, "learning_rate": 3.777777777777777e-05, "loss": 10.471, "step": 1300, "task_loss": 1.1249465942382812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.033560432640000006, "compression/movement_sparsity/importance_threshold": -0.00011153260039138788, "compression/movement_sparsity/linear_layer_sparsity": 0.8827950212661849, "compression/movement_sparsity/model_sparsity": 0.7936110458460811, "compression_loss": 9.071030616760254, "distillation_loss": 6.111713409423828, "epoch": 3.28, "learning_rate": 3.807017543859649e-05, "loss": 10.4836, "step": 1310, "task_loss": 0.922943115234375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03373636724, "compression/movement_sparsity/importance_threshold": -0.00010848543241567799, "compression/movement_sparsity/linear_layer_sparsity": 0.8846154796183379, "compression/movement_sparsity/model_sparsity": 0.7952475932007538, "compression_loss": 9.117635726928711, "distillation_loss": 6.498852729797363, "epoch": 3.31, "learning_rate": 3.83625730994152e-05, "loss": 10.5445, "step": 1320, "task_loss": 1.1216354370117188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03390906784, "compression/movement_sparsity/importance_threshold": -0.00010549427696526705, "compression/movement_sparsity/linear_layer_sparsity": 0.8863515201558265, "compression/movement_sparsity/model_sparsity": 0.7968082510130413, "compression_loss": 9.163419723510742, "distillation_loss": 6.021423816680908, "epoch": 3.33, "learning_rate": 3.8654970760233916e-05, "loss": 10.5338, "step": 1330, "task_loss": 0.6783676147460938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03407856444, "compression/movement_sparsity/importance_threshold": -0.00010255861444344522, "compression/movement_sparsity/linear_layer_sparsity": 0.88778342366757, "compression/movement_sparsity/model_sparsity": 0.7980954971076959, "compression_loss": 9.208226203918457, "distillation_loss": 6.7269744873046875, "epoch": 3.36, "learning_rate": 3.894736842105263e-05, "loss": 10.5765, "step": 1340, "task_loss": 1.1303558349609375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03424488704, "compression/movement_sparsity/importance_threshold": -9.96779252535031e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8894497609906654, "compression/movement_sparsity/model_sparsity": 0.7995934934419042, "compression_loss": 9.252321243286133, "distillation_loss": 6.461292743682861, "epoch": 3.38, "learning_rate": 3.923976608187134e-05, "loss": 10.6448, "step": 1350, "task_loss": 1.000396728515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034408065640000006, "compression/movement_sparsity/importance_threshold": -9.685168979873073e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8906103324488106, "compression/movement_sparsity/model_sparsity": 0.8006368186833138, "compression_loss": 9.295612335205078, "distillation_loss": 6.224025726318359, "epoch": 3.41, "learning_rate": 3.9532163742690055e-05, "loss": 10.6635, "step": 1360, "task_loss": 0.9190139770507812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03456813024, "compression/movement_sparsity/importance_threshold": -9.407938848241882e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8919698274239687, "compression/movement_sparsity/model_sparsity": 0.8018589712816715, "compression_loss": 9.337968826293945, "distillation_loss": 5.790504455566406, "epoch": 3.43, "learning_rate": 3.982456140350877e-05, "loss": 10.6898, "step": 1370, "task_loss": 1.1337509155273438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03472511084, "compression/movement_sparsity/importance_threshold": -9.136050170785762e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8939680372064137, "compression/movement_sparsity/model_sparsity": 0.8036553128072406, "compression_loss": 9.379193305969238, "distillation_loss": 5.9002180099487305, "epoch": 3.46, "learning_rate": 4.011695906432748e-05, "loss": 10.7616, "step": 1380, "task_loss": 0.8615188598632812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03487903744, "compression/movement_sparsity/importance_threshold": -8.869450987833742e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8949692181383619, "compression/movement_sparsity/model_sparsity": 0.8045553498796574, "compression_loss": 9.419586181640625, "distillation_loss": 6.334925651550293, "epoch": 3.48, "learning_rate": 4.04093567251462e-05, "loss": 10.8374, "step": 1390, "task_loss": 0.8665771484375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03502994004, "compression/movement_sparsity/importance_threshold": -8.608089339714857e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8967895235809997, "compression/movement_sparsity/model_sparsity": 0.8061917597724307, "compression_loss": 9.45916748046875, "distillation_loss": 6.124588966369629, "epoch": 3.51, "learning_rate": 4.0701754385964906e-05, "loss": 10.8253, "step": 1400, "task_loss": 1.02923583984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03517784864, "compression/movement_sparsity/importance_threshold": -8.351913266758156e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8980414643556158, "compression/movement_sparsity/model_sparsity": 0.8073172237856457, "compression_loss": 9.49764633178711, "distillation_loss": 6.059030532836914, "epoch": 3.53, "learning_rate": 4.0994152046783625e-05, "loss": 10.8515, "step": 1410, "task_loss": 0.8450164794921875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035322793240000004, "compression/movement_sparsity/importance_threshold": -8.100870809292667e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.8997447234454984, "compression/movement_sparsity/model_sparsity": 0.8088484118815298, "compression_loss": 9.53516960144043, "distillation_loss": 5.919647216796875, "epoch": 3.56, "learning_rate": 4.128654970760233e-05, "loss": 10.9353, "step": 1420, "task_loss": 0.7730026245117188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03546480384, "compression/movement_sparsity/importance_threshold": -7.854910007647426e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.901417294771906, "compression/movement_sparsity/model_sparsity": 0.8103520124316312, "compression_loss": 9.572003364562988, "distillation_loss": 6.195030212402344, "epoch": 3.58, "learning_rate": 4.157894736842105e-05, "loss": 10.923, "step": 1430, "task_loss": 0.8358230590820312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035603910440000004, "compression/movement_sparsity/importance_threshold": -7.613978902151471e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.902864101080247, "compression/movement_sparsity/model_sparsity": 0.8116526557744678, "compression_loss": 9.60808277130127, "distillation_loss": 6.326425552368164, "epoch": 3.61, "learning_rate": 4.187134502923976e-05, "loss": 10.9609, "step": 1440, "task_loss": 0.6175155639648438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035740143040000005, "compression/movement_sparsity/importance_threshold": -7.378025533133828e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.904185298102981, "compression/movement_sparsity/model_sparsity": 0.8128403794540563, "compression_loss": 9.643380165100098, "distillation_loss": 5.781469821929932, "epoch": 3.63, "learning_rate": 4.2163742690058476e-05, "loss": 10.9417, "step": 1450, "task_loss": 0.5803070068359375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035873531640000005, "compression/movement_sparsity/importance_threshold": -7.146997940923568e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.904882753688648, "compression/movement_sparsity/model_sparsity": 0.813467374898569, "compression_loss": 9.678108215332031, "distillation_loss": 5.847672939300537, "epoch": 3.66, "learning_rate": 4.245614035087719e-05, "loss": 11.0408, "step": 1460, "task_loss": 0.71551513671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03600410624, "compression/movement_sparsity/importance_threshold": -6.920844165849685e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.906142151742698, "compression/movement_sparsity/model_sparsity": 0.8145995428228711, "compression_loss": 9.712223052978516, "distillation_loss": 5.703580856323242, "epoch": 3.68, "learning_rate": 4.27485380116959e-05, "loss": 11.0421, "step": 1470, "task_loss": 0.7837600708007812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.036131896840000004, "compression/movement_sparsity/importance_threshold": -6.699512248241237e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9071853710290575, "compression/movement_sparsity/model_sparsity": 0.8155373713435928, "compression_loss": 9.745464324951172, "distillation_loss": 5.9033050537109375, "epoch": 3.71, "learning_rate": 4.3040935672514615e-05, "loss": 11.0491, "step": 1480, "task_loss": 0.887847900390625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03625693344, "compression/movement_sparsity/importance_threshold": -6.482950228427252e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9080575316169829, "compression/movement_sparsity/model_sparsity": 0.8163214222950089, "compression_loss": 9.778051376342773, "distillation_loss": 6.591049671173096, "epoch": 3.73, "learning_rate": 4.3333333333333334e-05, "loss": 11.08, "step": 1490, "task_loss": 0.889251708984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03637924604, "compression/movement_sparsity/importance_threshold": -6.271106146736789e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9087722777401385, "compression/movement_sparsity/model_sparsity": 0.8169639615081304, "compression_loss": 9.810018539428711, "distillation_loss": 5.86897087097168, "epoch": 3.76, "learning_rate": 4.362573099415204e-05, "loss": 11.0899, "step": 1500, "task_loss": 0.8062057495117188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03649886464, "compression/movement_sparsity/importance_threshold": -6.063928043498841e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9095836509146341, "compression/movement_sparsity/model_sparsity": 0.8176933660675936, "compression_loss": 9.841254234313965, "distillation_loss": 5.643402576446533, "epoch": 3.78, "learning_rate": 4.391812865497076e-05, "loss": 11.1397, "step": 1510, "task_loss": 0.72369384765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03661581924, "compression/movement_sparsity/importance_threshold": -5.861363959042479e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.910283776535682, "compression/movement_sparsity/model_sparsity": 0.8183227618083473, "compression_loss": 9.871673583984375, "distillation_loss": 5.854546070098877, "epoch": 3.81, "learning_rate": 4.4210526315789466e-05, "loss": 11.1555, "step": 1520, "task_loss": 0.75244140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03673013984, "compression/movement_sparsity/importance_threshold": -5.663361933696729e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9109775505307136, "compression/movement_sparsity/model_sparsity": 0.8189464475932854, "compression_loss": 9.901331901550293, "distillation_loss": 5.80916690826416, "epoch": 3.83, "learning_rate": 4.4502923976608185e-05, "loss": 11.2043, "step": 1530, "task_loss": 0.7809829711914062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03684185644, "compression/movement_sparsity/importance_threshold": -5.4698700077906396e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9116742415688046, "compression/movement_sparsity/model_sparsity": 0.8195727557283017, "compression_loss": 9.930371284484863, "distillation_loss": 6.172576427459717, "epoch": 3.86, "learning_rate": 4.479532163742689e-05, "loss": 11.307, "step": 1540, "task_loss": 0.8880844116210938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03695099904, "compression/movement_sparsity/importance_threshold": -5.280836221653216e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9124707001844324, "compression/movement_sparsity/model_sparsity": 0.8202887524655906, "compression_loss": 9.958555221557617, "distillation_loss": 6.095415115356445, "epoch": 3.88, "learning_rate": 4.508771929824561e-05, "loss": 11.2818, "step": 1550, "task_loss": 0.8449211120605469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03705759764, "compression/movement_sparsity/importance_threshold": -5.096208615613528e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9127668976776573, "compression/movement_sparsity/model_sparsity": 0.8205550267384542, "compression_loss": 9.986018180847168, "distillation_loss": 6.143268585205078, "epoch": 3.91, "learning_rate": 4.538011695906432e-05, "loss": 11.2738, "step": 1560, "task_loss": 0.9170875549316406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03716168224, "compression/movement_sparsity/importance_threshold": -4.915935230000591e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9140392577536887, "compression/movement_sparsity/model_sparsity": 0.8216988472022169, "compression_loss": 10.012683868408203, "distillation_loss": 6.018669128417969, "epoch": 3.93, "learning_rate": 4.5672514619883036e-05, "loss": 11.2507, "step": 1570, "task_loss": 0.5751380920410156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03726328284, "compression/movement_sparsity/importance_threshold": -4.7399641051434536e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.915082559375941, "compression/movement_sparsity/model_sparsity": 0.8226367497408843, "compression_loss": 10.038643836975098, "distillation_loss": 6.239258289337158, "epoch": 3.96, "learning_rate": 4.596491228070175e-05, "loss": 11.3294, "step": 1580, "task_loss": 0.8149871826171875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03736242944, "compression/movement_sparsity/importance_threshold": -4.5682432813711534e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9154963913354411, "compression/movement_sparsity/model_sparsity": 0.8230087745102497, "compression_loss": 10.064014434814453, "distillation_loss": 6.248401165008545, "epoch": 3.98, "learning_rate": 4.625730994152047e-05, "loss": 11.3484, "step": 1590, "task_loss": 0.6874847412109375 }, { "epoch": 4.0, "eval_accuracy": 0.8718740806119447, "eval_loss": 11.194880485534668, "eval_runtime": 32.6266, "eval_samples_per_second": 208.358, "eval_steps_per_second": 3.28, "step": 1596 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037468692134999995, "compression/movement_sparsity/importance_threshold": -4.384197459107297e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9165756031692261, "compression/movement_sparsity/model_sparsity": 0.8239789594472594, "compression_loss": 10.091094970703125, "distillation_loss": 5.910013198852539, "epoch": 4.01, "learning_rate": 4.6549707602339174e-05, "loss": 11.6289, "step": 1600, "task_loss": 0.7885818481445312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037562782985, "compression/movement_sparsity/importance_threshold": -4.2212331388841303e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.917558799589732, "compression/movement_sparsity/model_sparsity": 0.8248628288855245, "compression_loss": 10.114706993103027, "distillation_loss": 5.721433639526367, "epoch": 4.04, "learning_rate": 4.6842105263157894e-05, "loss": 11.3734, "step": 1610, "task_loss": 0.7978591918945312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037654512835, "compression/movement_sparsity/importance_threshold": -4.06235804476583e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9183277109680819, "compression/movement_sparsity/model_sparsity": 0.8255540613329617, "compression_loss": 10.137986183166504, "distillation_loss": 6.493091583251953, "epoch": 4.06, "learning_rate": 4.71345029239766e-05, "loss": 11.4261, "step": 1620, "task_loss": 0.7956275939941406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037743911685000006, "compression/movement_sparsity/importance_threshold": -3.9075202170813905e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9188648232836495, "compression/movement_sparsity/model_sparsity": 0.8260369121151093, "compression_loss": 10.1608304977417, "distillation_loss": 5.440267562866211, "epoch": 4.09, "learning_rate": 4.742690058479532e-05, "loss": 11.4415, "step": 1630, "task_loss": 0.6993637084960938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037831009535, "compression/movement_sparsity/importance_threshold": -3.756667696159881e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9194862005043661, "compression/movement_sparsity/model_sparsity": 0.8265955149777428, "compression_loss": 10.182988166809082, "distillation_loss": 5.837116241455078, "epoch": 4.11, "learning_rate": 4.7719298245614025e-05, "loss": 11.3917, "step": 1640, "task_loss": 0.6603546142578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037915836385, "compression/movement_sparsity/importance_threshold": -3.6097485223303286e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.919930038015658, "compression/movement_sparsity/model_sparsity": 0.8269945140013404, "compression_loss": 10.204331398010254, "distillation_loss": 5.721419811248779, "epoch": 4.14, "learning_rate": 4.8011695906432745e-05, "loss": 11.4511, "step": 1650, "task_loss": 0.7290534973144531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037998422235, "compression/movement_sparsity/importance_threshold": -3.466710735921738e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9208434371236074, "compression/movement_sparsity/model_sparsity": 0.8278156373695883, "compression_loss": 10.22504711151123, "distillation_loss": 5.501497745513916, "epoch": 4.16, "learning_rate": 4.830409356725146e-05, "loss": 11.4797, "step": 1660, "task_loss": 0.6077766418457031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038078797085000005, "compression/movement_sparsity/importance_threshold": -3.32750237726319e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9213056943503463, "compression/movement_sparsity/model_sparsity": 0.8282311952650508, "compression_loss": 10.245244979858398, "distillation_loss": 5.704554557800293, "epoch": 4.19, "learning_rate": 4.859649122807017e-05, "loss": 11.4279, "step": 1670, "task_loss": 0.4684104919433594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038156990935, "compression/movement_sparsity/importance_threshold": -3.192071486683701e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9218820691245107, "compression/movement_sparsity/model_sparsity": 0.8287493420333322, "compression_loss": 10.264981269836426, "distillation_loss": 5.46170711517334, "epoch": 4.21, "learning_rate": 4.888888888888888e-05, "loss": 11.4706, "step": 1680, "task_loss": 0.5621528625488281 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038233033785, "compression/movement_sparsity/importance_threshold": -3.060366104512308e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.922172020852153, "compression/movement_sparsity/model_sparsity": 0.8290100015163104, "compression_loss": 10.284026145935059, "distillation_loss": 5.784496784210205, "epoch": 4.24, "learning_rate": 4.9181286549707596e-05, "loss": 11.4583, "step": 1690, "task_loss": 0.8902587890625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038306955635, "compression/movement_sparsity/importance_threshold": -2.932334271078048e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9229681265996688, "compression/movement_sparsity/model_sparsity": 0.8297256810338319, "compression_loss": 10.302370071411133, "distillation_loss": 5.93410587310791, "epoch": 4.26, "learning_rate": 4.947368421052631e-05, "loss": 11.504, "step": 1700, "task_loss": 0.7793045043945312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038378786485, "compression/movement_sparsity/importance_threshold": -2.80792402670997e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9235381615100873, "compression/movement_sparsity/model_sparsity": 0.83023812842029, "compression_loss": 10.32028579711914, "distillation_loss": 5.835970401763916, "epoch": 4.29, "learning_rate": 4.976608187134503e-05, "loss": 11.515, "step": 1710, "task_loss": 0.9279670715332031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038448556335, "compression/movement_sparsity/importance_threshold": -2.6870834117370787e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.923452214600271, "compression/movement_sparsity/model_sparsity": 0.8301608642589122, "compression_loss": 10.337579727172852, "distillation_loss": 5.414052963256836, "epoch": 4.31, "learning_rate": 5.0058479532163734e-05, "loss": 11.5121, "step": 1720, "task_loss": 0.7787857055664062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038516295185, "compression/movement_sparsity/importance_threshold": -2.569760466488444e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9243088607535381, "compression/movement_sparsity/model_sparsity": 0.8309309681145494, "compression_loss": 10.353998184204102, "distillation_loss": 5.553279876708984, "epoch": 4.34, "learning_rate": 5.0350877192982454e-05, "loss": 11.5231, "step": 1730, "task_loss": 0.883087158203125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038582033035, "compression/movement_sparsity/importance_threshold": -2.455903231293082e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9252170609379705, "compression/movement_sparsity/model_sparsity": 0.8317474177782224, "compression_loss": 10.369752883911133, "distillation_loss": 5.842885971069336, "epoch": 4.36, "learning_rate": 5.064327485380116e-05, "loss": 11.5901, "step": 1740, "task_loss": 0.6491127014160156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038645799885, "compression/movement_sparsity/importance_threshold": -2.34545974648003e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.925555508506474, "compression/movement_sparsity/model_sparsity": 0.8320516738312513, "compression_loss": 10.385010719299316, "distillation_loss": 5.671030044555664, "epoch": 4.39, "learning_rate": 5.093567251461988e-05, "loss": 11.5727, "step": 1750, "task_loss": 0.6597175598144531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038707625735, "compression/movement_sparsity/importance_threshold": -2.238378052378336e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9264931496537188, "compression/movement_sparsity/model_sparsity": 0.8328945901975285, "compression_loss": 10.399575233459473, "distillation_loss": 5.7124786376953125, "epoch": 4.41, "learning_rate": 5.122807017543859e-05, "loss": 11.5967, "step": 1760, "task_loss": 0.8300895690917969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038767540585, "compression/movement_sparsity/importance_threshold": -2.134606189317027e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9267444740853659, "compression/movement_sparsity/model_sparsity": 0.8331205246899547, "compression_loss": 10.414167404174805, "distillation_loss": 5.657853603363037, "epoch": 4.44, "learning_rate": 5.1520467836257305e-05, "loss": 11.6035, "step": 1770, "task_loss": 0.6173133850097656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038825574435, "compression/movement_sparsity/importance_threshold": -2.0340921976251297e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9271155737165011, "compression/movement_sparsity/model_sparsity": 0.8334541341454723, "compression_loss": 10.428546905517578, "distillation_loss": 5.23508358001709, "epoch": 4.46, "learning_rate": 5.181286549707602e-05, "loss": 11.598, "step": 1780, "task_loss": 0.5253219604492188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038881757285, "compression/movement_sparsity/importance_threshold": -1.9367841176317138e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9273720264980427, "compression/movement_sparsity/model_sparsity": 0.8336846788985199, "compression_loss": 10.44232177734375, "distillation_loss": 5.694540023803711, "epoch": 4.49, "learning_rate": 5.210526315789473e-05, "loss": 11.6457, "step": 1790, "task_loss": 0.7008819580078125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038936119135, "compression/movement_sparsity/importance_threshold": -1.8426299896657844e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9279394148976212, "compression/movement_sparsity/model_sparsity": 0.8341947471367215, "compression_loss": 10.455520629882812, "distillation_loss": 5.280182838439941, "epoch": 4.51, "learning_rate": 5.239766081871344e-05, "loss": 11.6078, "step": 1800, "task_loss": 0.7581596374511719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038988689985, "compression/movement_sparsity/importance_threshold": -1.7515778540563897e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9280301725760313, "compression/movement_sparsity/model_sparsity": 0.8342763360609299, "compression_loss": 10.468148231506348, "distillation_loss": 5.653448581695557, "epoch": 4.54, "learning_rate": 5.269005847953216e-05, "loss": 11.6467, "step": 1810, "task_loss": 0.5723075866699219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039039499835, "compression/movement_sparsity/importance_threshold": -1.6635757511325564e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.928663417927582, "compression/movement_sparsity/model_sparsity": 0.8348456080817446, "compression_loss": 10.480254173278809, "distillation_loss": 5.12320613861084, "epoch": 4.56, "learning_rate": 5.298245614035087e-05, "loss": 11.6528, "step": 1820, "task_loss": 0.6407356262207031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039088578685, "compression/movement_sparsity/importance_threshold": -1.5785717212233328e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9287730187631738, "compression/movement_sparsity/model_sparsity": 0.834944136541539, "compression_loss": 10.49207592010498, "distillation_loss": 5.552064895629883, "epoch": 4.59, "learning_rate": 5.327485380116959e-05, "loss": 11.6938, "step": 1830, "task_loss": 0.6142463684082031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039135956535, "compression/movement_sparsity/importance_threshold": -1.4965138046577563e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9293162709838904, "compression/movement_sparsity/model_sparsity": 0.8354325069476416, "compression_loss": 10.503337860107422, "distillation_loss": 5.498279094696045, "epoch": 4.61, "learning_rate": 5.3567251461988294e-05, "loss": 11.6493, "step": 1840, "task_loss": 0.6056175231933594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039181663385, "compression/movement_sparsity/importance_threshold": -1.4173500417648645e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9297535686728395, "compression/movement_sparsity/model_sparsity": 0.8358256268315476, "compression_loss": 10.513941764831543, "distillation_loss": 5.60037088394165, "epoch": 4.64, "learning_rate": 5.3859649122807013e-05, "loss": 11.6519, "step": 1850, "task_loss": 0.5648880004882812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039225729235, "compression/movement_sparsity/importance_threshold": -1.3410284728736731e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9301191635614273, "compression/movement_sparsity/model_sparsity": 0.8361542876586918, "compression_loss": 10.523796081542969, "distillation_loss": 5.326607704162598, "epoch": 4.66, "learning_rate": 5.4152046783625726e-05, "loss": 11.6778, "step": 1860, "task_loss": 0.7541923522949219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039268184085, "compression/movement_sparsity/importance_threshold": -1.2674971383132305e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9308487654320987, "compression/movement_sparsity/model_sparsity": 0.8368101818240263, "compression_loss": 10.533379554748535, "distillation_loss": 5.710136413574219, "epoch": 4.69, "learning_rate": 5.444444444444444e-05, "loss": 11.6995, "step": 1870, "task_loss": 0.7020912170410156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039309057935, "compression/movement_sparsity/importance_threshold": -1.1967040784125848e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9314339619090636, "compression/movement_sparsity/model_sparsity": 0.8373362590864959, "compression_loss": 10.542318344116211, "distillation_loss": 5.294053554534912, "epoch": 4.71, "learning_rate": 5.473684210526315e-05, "loss": 11.6845, "step": 1880, "task_loss": 0.6209945678710938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039348380785, "compression/movement_sparsity/importance_threshold": -1.128597333500752e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.93194402100271, "compression/movement_sparsity/model_sparsity": 0.8377947896864664, "compression_loss": 10.55092716217041, "distillation_loss": 5.458676338195801, "epoch": 4.74, "learning_rate": 5.5029239766081864e-05, "loss": 11.65, "step": 1890, "task_loss": 0.45584869384765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039386182635, "compression/movement_sparsity/importance_threshold": -1.0631249439067912e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9321739498644986, "compression/movement_sparsity/model_sparsity": 0.838001490086988, "compression_loss": 10.55921459197998, "distillation_loss": 5.362114906311035, "epoch": 4.76, "learning_rate": 5.532163742690058e-05, "loss": 11.66, "step": 1900, "task_loss": 0.5351524353027344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039422493485, "compression/movement_sparsity/importance_threshold": -1.0002349499597181e-05, "compression/movement_sparsity/linear_layer_sparsity": 0.9321951690003011, "compression/movement_sparsity/model_sparsity": 0.8380205655690087, "compression_loss": 10.567584037780762, "distillation_loss": 5.540861129760742, "epoch": 4.79, "learning_rate": 5.56140350877193e-05, "loss": 11.6622, "step": 1910, "task_loss": 0.7154998779296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039457343335, "compression/movement_sparsity/importance_threshold": -9.398753919885702e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.932436648411623, "compression/movement_sparsity/model_sparsity": 0.8382376496299209, "compression_loss": 10.575636863708496, "distillation_loss": 5.791957378387451, "epoch": 4.81, "learning_rate": 5.5906432748538e-05, "loss": 11.6768, "step": 1920, "task_loss": 0.7495079040527344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039490762185, "compression/movement_sparsity/importance_threshold": -8.819943103223959e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9328805447342667, "compression/movement_sparsity/model_sparsity": 0.8386367015234797, "compression_loss": 10.583325386047363, "distillation_loss": 5.243807315826416, "epoch": 4.84, "learning_rate": 5.619883040935672e-05, "loss": 11.6939, "step": 1930, "task_loss": 0.3921699523925781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039522780035, "compression/movement_sparsity/importance_threshold": -8.265397452902326e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9331947385012045, "compression/movement_sparsity/model_sparsity": 0.8389191540044871, "compression_loss": 10.59062385559082, "distillation_loss": 5.702177047729492, "epoch": 4.86, "learning_rate": 5.649122807017543e-05, "loss": 11.687, "step": 1940, "task_loss": 0.665802001953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039553426885, "compression/movement_sparsity/importance_threshold": -7.734597372211069e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9333125964506173, "compression/movement_sparsity/model_sparsity": 0.8390251054068417, "compression_loss": 10.597086906433105, "distillation_loss": 5.53410530090332, "epoch": 4.89, "learning_rate": 5.678362573099415e-05, "loss": 11.6623, "step": 1950, "task_loss": 0.5931320190429688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039582732735, "compression/movement_sparsity/importance_threshold": -7.227023264440562e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9338856542645287, "compression/movement_sparsity/model_sparsity": 0.8395402703093083, "compression_loss": 10.602754592895508, "distillation_loss": 6.0722975730896, "epoch": 4.91, "learning_rate": 5.707602339181286e-05, "loss": 11.7102, "step": 1960, "task_loss": 0.7799224853515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039610727585, "compression/movement_sparsity/importance_threshold": -6.74215553288129e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9343747412300512, "compression/movement_sparsity/model_sparsity": 0.8399799474810953, "compression_loss": 10.608075141906738, "distillation_loss": 5.408370018005371, "epoch": 4.94, "learning_rate": 5.736842105263157e-05, "loss": 11.6947, "step": 1970, "task_loss": 0.6344947814941406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039637441435000004, "compression/movement_sparsity/importance_threshold": -6.2794745808234095e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9346346638813611, "compression/movement_sparsity/model_sparsity": 0.8402136115618569, "compression_loss": 10.613485336303711, "distillation_loss": 5.309972763061523, "epoch": 4.96, "learning_rate": 5.7660818713450286e-05, "loss": 11.7285, "step": 1980, "task_loss": 0.6148605346679688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039662904285, "compression/movement_sparsity/importance_threshold": -5.838460811557404e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9349967889001807, "compression/movement_sparsity/model_sparsity": 0.8405391530612871, "compression_loss": 10.618734359741211, "distillation_loss": 5.216809272766113, "epoch": 4.99, "learning_rate": 5.7953216374269e-05, "loss": 11.6849, "step": 1990, "task_loss": 0.4887504577636719 }, { "epoch": 5.0, "eval_accuracy": 0.9014416004707266, "eval_loss": 11.547859191894531, "eval_runtime": 33.3462, "eval_samples_per_second": 203.861, "eval_steps_per_second": 3.209, "step": 1995 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03968950432, "compression/movement_sparsity/importance_threshold": -5.377751122816491e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9353497275858175, "compression/movement_sparsity/model_sparsity": 0.8408564362727693, "compression_loss": 10.624021530151367, "distillation_loss": 5.169059753417969, "epoch": 5.01, "learning_rate": 5.824561403508771e-05, "loss": 11.99, "step": 2000, "task_loss": 0.5792617797851562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03971243772, "compression/movement_sparsity/importance_threshold": -4.980547150123518e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9356789888211382, "compression/movement_sparsity/model_sparsity": 0.8411524340378502, "compression_loss": 10.627915382385254, "distillation_loss": 5.314959526062012, "epoch": 5.04, "learning_rate": 5.853801169590643e-05, "loss": 11.6732, "step": 2010, "task_loss": 0.4535560607910156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03973421312, "compression/movement_sparsity/importance_threshold": -4.603399610422494e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9357462219587473, "compression/movement_sparsity/model_sparsity": 0.8412128749775567, "compression_loss": 10.631778717041016, "distillation_loss": 5.557980537414551, "epoch": 5.06, "learning_rate": 5.883040935672514e-05, "loss": 11.7043, "step": 2020, "task_loss": 0.7558097839355469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03975486052, "compression/movement_sparsity/importance_threshold": -4.245788907003903e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9361655468985245, "compression/movement_sparsity/model_sparsity": 0.8415898378013033, "compression_loss": 10.635494232177734, "distillation_loss": 5.396326541900635, "epoch": 5.09, "learning_rate": 5.9122807017543856e-05, "loss": 11.6958, "step": 2030, "task_loss": 0.8873405456542969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03977440992, "compression/movement_sparsity/importance_threshold": -3.907195443158011e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9364012039859982, "compression/movement_sparsity/model_sparsity": 0.8418016877360511, "compression_loss": 10.639056205749512, "distillation_loss": 5.366767406463623, "epoch": 5.11, "learning_rate": 5.941520467836256e-05, "loss": 11.6648, "step": 2040, "task_loss": 0.584869384765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03979289132, "compression/movement_sparsity/importance_threshold": -3.5870996221751923e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9367240900707619, "compression/movement_sparsity/model_sparsity": 0.8420919543973321, "compression_loss": 10.642599105834961, "distillation_loss": 5.1943817138671875, "epoch": 5.14, "learning_rate": 5.970760233918128e-05, "loss": 11.662, "step": 2050, "task_loss": 0.519378662109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03981033472, "compression/movement_sparsity/importance_threshold": -3.284981847345822e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9369445150180669, "compression/movement_sparsity/model_sparsity": 0.842290111012115, "compression_loss": 10.645750045776367, "distillation_loss": 5.329228401184082, "epoch": 5.16, "learning_rate": 5.9999999999999995e-05, "loss": 11.6968, "step": 2060, "task_loss": 0.4632720947265625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03982677012, "compression/movement_sparsity/importance_threshold": -3.000322521960275e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9373246480728696, "compression/movement_sparsity/model_sparsity": 0.8426318412936814, "compression_loss": 10.648528099060059, "distillation_loss": 4.981120586395264, "epoch": 5.19, "learning_rate": 6.029239766081871e-05, "loss": 11.758, "step": 2070, "task_loss": 0.34719085693359375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03984222752, "compression/movement_sparsity/importance_threshold": -2.7326020493088172e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9375303231330925, "compression/movement_sparsity/model_sparsity": 0.8428167381221816, "compression_loss": 10.650529861450195, "distillation_loss": 5.14153528213501, "epoch": 5.21, "learning_rate": 6.058479532163742e-05, "loss": 11.6921, "step": 2080, "task_loss": 0.5952033996582031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039856736920000004, "compression/movement_sparsity/importance_threshold": -2.4813008326819317e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9375435086382113, "compression/movement_sparsity/model_sparsity": 0.8428285915674949, "compression_loss": 10.651772499084473, "distillation_loss": 5.077071189880371, "epoch": 5.24, "learning_rate": 6.087719298245613e-05, "loss": 11.7209, "step": 2090, "task_loss": 0.735076904296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03987032832, "compression/movement_sparsity/importance_threshold": -2.245899275369885e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9379964266222524, "compression/movement_sparsity/model_sparsity": 0.8432357537131102, "compression_loss": 10.653236389160156, "distillation_loss": 5.161840915679932, "epoch": 5.26, "learning_rate": 6.116959064327485e-05, "loss": 11.6636, "step": 2100, "task_loss": 0.4819183349609375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03988303172, "compression/movement_sparsity/importance_threshold": -2.0258777806631597e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9381804591049383, "compression/movement_sparsity/model_sparsity": 0.8434011943958687, "compression_loss": 10.65453052520752, "distillation_loss": 5.505778789520264, "epoch": 5.29, "learning_rate": 6.146198830409357e-05, "loss": 11.7213, "step": 2110, "task_loss": 0.6404914855957031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03989487712, "compression/movement_sparsity/importance_threshold": -1.820716751852131e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9386689579569407, "compression/movement_sparsity/model_sparsity": 0.8438403428680431, "compression_loss": 10.65531063079834, "distillation_loss": 5.463746070861816, "epoch": 5.31, "learning_rate": 6.175438596491227e-05, "loss": 11.6959, "step": 2120, "task_loss": 0.4452247619628906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03990589452, "compression/movement_sparsity/importance_threshold": -1.6298965922269563e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9391311916591388, "compression/movement_sparsity/model_sparsity": 0.8442558796155212, "compression_loss": 10.655555725097656, "distillation_loss": 5.088167190551758, "epoch": 5.34, "learning_rate": 6.204678362573099e-05, "loss": 11.7288, "step": 2130, "task_loss": 0.4614067077636719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039916113919999996, "compression/movement_sparsity/importance_threshold": -1.452897705078119e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9390296244542307, "compression/movement_sparsity/model_sparsity": 0.8441645731924342, "compression_loss": 10.656309127807617, "distillation_loss": 5.118770122528076, "epoch": 5.36, "learning_rate": 6.230994152046783e-05, "loss": 11.7144, "step": 2140, "task_loss": 0.4471626281738281 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03992556532, "compression/movement_sparsity/importance_threshold": -1.2892004936959937e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9392110104260765, "compression/movement_sparsity/model_sparsity": 0.8443276347269362, "compression_loss": 10.656984329223633, "distillation_loss": 4.791459083557129, "epoch": 5.39, "learning_rate": 6.260233918128654e-05, "loss": 11.7519, "step": 2150, "task_loss": 0.5996971130371094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03993427872, "compression/movement_sparsity/importance_threshold": -1.1382853613710634e-06, "compression/movement_sparsity/linear_layer_sparsity": 0.9395007974819332, "compression/movement_sparsity/model_sparsity": 0.844588146174023, "compression_loss": 10.657218933105469, "distillation_loss": 4.999011039733887, "epoch": 5.41, "learning_rate": 6.289473684210526e-05, "loss": 11.672, "step": 2160, "task_loss": 0.5861434936523438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03994228412, "compression/movement_sparsity/importance_threshold": -9.996327113933777e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9398508896981331, "compression/movement_sparsity/model_sparsity": 0.8449028704793804, "compression_loss": 10.65735912322998, "distillation_loss": 5.359859943389893, "epoch": 5.44, "learning_rate": 6.318713450292397e-05, "loss": 11.7018, "step": 2170, "task_loss": 0.8405075073242188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039949611520000004, "compression/movement_sparsity/importance_threshold": -8.727229470536363e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9398334580133996, "compression/movement_sparsity/model_sparsity": 0.8448871998228645, "compression_loss": 10.657649993896484, "distillation_loss": 4.696287631988525, "epoch": 5.46, "learning_rate": 6.347953216374269e-05, "loss": 11.6592, "step": 2180, "task_loss": 0.4386634826660156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039956290920000005, "compression/movement_sparsity/importance_threshold": -7.570364716419972e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.940128996819482, "compression/movement_sparsity/model_sparsity": 0.845152881952162, "compression_loss": 10.65783977508545, "distillation_loss": 5.079584121704102, "epoch": 5.49, "learning_rate": 6.377192982456139e-05, "loss": 11.6442, "step": 2190, "task_loss": 0.7108612060546875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03996235232, "compression/movement_sparsity/importance_threshold": -6.520536884488349e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9404820648900933, "compression/movement_sparsity/model_sparsity": 0.8454702814775589, "compression_loss": 10.657474517822266, "distillation_loss": 4.942742347717285, "epoch": 5.51, "learning_rate": 6.406432748538011e-05, "loss": 11.689, "step": 2200, "task_loss": 0.337738037109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03996782572, "compression/movement_sparsity/importance_threshold": -5.572550007646327e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9405549556797651, "compression/movement_sparsity/model_sparsity": 0.8455358085075381, "compression_loss": 10.656968116760254, "distillation_loss": 5.540010929107666, "epoch": 5.54, "learning_rate": 6.435672514619882e-05, "loss": 11.6767, "step": 2210, "task_loss": 0.5631599426269531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03997274112, "compression/movement_sparsity/importance_threshold": -4.7212081187965674e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9407192981218007, "compression/movement_sparsity/model_sparsity": 0.8456835483272684, "compression_loss": 10.6561279296875, "distillation_loss": 4.723138809204102, "epoch": 5.56, "learning_rate": 6.464912280701754e-05, "loss": 11.6576, "step": 2220, "task_loss": 0.6410751342773438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03997712852, "compression/movement_sparsity/importance_threshold": -3.961315250842817e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.940927925511894, "compression/movement_sparsity/model_sparsity": 0.8458710992278237, "compression_loss": 10.65514087677002, "distillation_loss": 5.012965202331543, "epoch": 5.59, "learning_rate": 6.494152046783626e-05, "loss": 11.6773, "step": 2230, "task_loss": 0.4655570983886719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998101792, "compression/movement_sparsity/importance_threshold": -3.2876754366888226e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9410338682813911, "compression/movement_sparsity/model_sparsity": 0.845966339176028, "compression_loss": 10.654282569885254, "distillation_loss": 5.042932510375977, "epoch": 5.61, "learning_rate": 6.523391812865496e-05, "loss": 11.6747, "step": 2240, "task_loss": 0.5907707214355469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998443932, "compression/movement_sparsity/importance_threshold": -2.695092709238331e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9414383727604637, "compression/movement_sparsity/model_sparsity": 0.8463299787695383, "compression_loss": 10.653236389160156, "distillation_loss": 5.0931243896484375, "epoch": 5.64, "learning_rate": 6.552631578947368e-05, "loss": 11.6643, "step": 2250, "task_loss": 0.46851348876953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998742272, "compression/movement_sparsity/importance_threshold": -2.1783711013940044e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9416739945611262, "compression/movement_sparsity/model_sparsity": 0.8465417969823095, "compression_loss": 10.651611328125, "distillation_loss": 5.121889114379883, "epoch": 5.66, "learning_rate": 6.581871345029239e-05, "loss": 11.6615, "step": 2260, "task_loss": 0.6543617248535156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03998999812, "compression/movement_sparsity/importance_threshold": -1.732314646060674e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9419127804125264, "compression/movement_sparsity/model_sparsity": 0.8467564595989961, "compression_loss": 10.650047302246094, "distillation_loss": 4.9736175537109375, "epoch": 5.69, "learning_rate": 6.611111111111111e-05, "loss": 11.693, "step": 2270, "task_loss": 0.49425506591796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999219552, "compression/movement_sparsity/importance_threshold": -1.3517273761420867e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9419647343608852, "compression/movement_sparsity/model_sparsity": 0.8468031649227686, "compression_loss": 10.648653030395508, "distillation_loss": 5.2847442626953125, "epoch": 5.71, "learning_rate": 6.640350877192983e-05, "loss": 11.5899, "step": 2280, "task_loss": 0.4386329650878906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999404492, "compression/movement_sparsity/importance_threshold": -1.0314133245419889e-07, "compression/movement_sparsity/linear_layer_sparsity": 0.9420043261630533, "compression/movement_sparsity/model_sparsity": 0.8468387569806853, "compression_loss": 10.64761734008789, "distillation_loss": 5.626967430114746, "epoch": 5.74, "learning_rate": 6.669590643274853e-05, "loss": 11.6698, "step": 2290, "task_loss": 0.8205833435058594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039995576320000004, "compression/movement_sparsity/importance_threshold": -7.66176524161959e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9422430884899127, "compression/movement_sparsity/model_sparsity": 0.8470533984493874, "compression_loss": 10.646581649780273, "distillation_loss": 5.5033769607543945, "epoch": 5.76, "learning_rate": 6.698830409356725e-05, "loss": 11.6293, "step": 2300, "task_loss": 0.7137565612792969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999681972, "compression/movement_sparsity/importance_threshold": -5.508210079079121e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9424840621236074, "compression/movement_sparsity/model_sparsity": 0.8472700278286328, "compression_loss": 10.645450592041016, "distillation_loss": 5.184342384338379, "epoch": 5.79, "learning_rate": 6.728070175438596e-05, "loss": 11.6464, "step": 2310, "task_loss": 0.5423812866210938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999780512, "compression/movement_sparsity/importance_threshold": -3.801508086825108e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9426952654509184, "compression/movement_sparsity/model_sparsity": 0.847459894433491, "compression_loss": 10.643987655639648, "distillation_loss": 5.054318428039551, "epoch": 5.81, "learning_rate": 6.757309941520468e-05, "loss": 11.5736, "step": 2320, "task_loss": 0.4838676452636719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039998562519999996, "compression/movement_sparsity/importance_threshold": -2.489699593895018e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.9428045604674796, "compression/movement_sparsity/model_sparsity": 0.8475581479694868, "compression_loss": 10.642233848571777, "distillation_loss": 4.734399318695068, "epoch": 5.84, "learning_rate": 6.786549707602338e-05, "loss": 11.6314, "step": 2330, "task_loss": 0.7984161376953125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999912192, "compression/movement_sparsity/importance_threshold": -1.5208249293263178e-08, "compression/movement_sparsity/linear_layer_sparsity": 0.943118448415387, "compression/movement_sparsity/model_sparsity": 0.8478403255266959, "compression_loss": 10.640581130981445, "distillation_loss": 5.430304527282715, "epoch": 5.86, "learning_rate": 6.81578947368421e-05, "loss": 11.6425, "step": 2340, "task_loss": 0.7073135375976562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039999513320000006, "compression/movement_sparsity/importance_threshold": -8.429244221456328e-09, "compression/movement_sparsity/linear_layer_sparsity": 0.9431040161096056, "compression/movement_sparsity/model_sparsity": 0.847827351238204, "compression_loss": 10.638731002807617, "distillation_loss": 4.759744644165039, "epoch": 5.89, "learning_rate": 6.845029239766081e-05, "loss": 11.6743, "step": 2350, "task_loss": 0.3960533142089844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039999766719999996, "compression/movement_sparsity/importance_threshold": -4.040384014121143e-09, "compression/movement_sparsity/linear_layer_sparsity": 0.9434210328214393, "compression/movement_sparsity/model_sparsity": 0.8481123414773517, "compression_loss": 10.636588096618652, "distillation_loss": 4.659360408782959, "epoch": 5.91, "learning_rate": 6.874269005847953e-05, "loss": 11.6125, "step": 2360, "task_loss": 0.5113105773925781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999991212, "compression/movement_sparsity/importance_threshold": -1.522071961415454e-09, "compression/movement_sparsity/linear_layer_sparsity": 0.9437621504253237, "compression/movement_sparsity/model_sparsity": 0.8484189978266217, "compression_loss": 10.634325981140137, "distillation_loss": 4.611985206604004, "epoch": 5.94, "learning_rate": 6.903508771929823e-05, "loss": 11.6087, "step": 2370, "task_loss": 0.5087242126464844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999997952, "compression/movement_sparsity/importance_threshold": -3.5471135371393375e-10, "compression/movement_sparsity/linear_layer_sparsity": 0.9440459740100873, "compression/movement_sparsity/model_sparsity": 0.8486741482596372, "compression_loss": 10.631763458251953, "distillation_loss": 5.167237281799316, "epoch": 5.96, "learning_rate": 6.932748538011695e-05, "loss": 11.6086, "step": 2380, "task_loss": 0.4631500244140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03999999892, "compression/movement_sparsity/importance_threshold": -1.8705481608094487e-11, "compression/movement_sparsity/linear_layer_sparsity": 0.9443540984454983, "compression/movement_sparsity/model_sparsity": 0.8489511445606432, "compression_loss": 10.628874778747559, "distillation_loss": 4.859969139099121, "epoch": 5.99, "learning_rate": 6.961988304093566e-05, "loss": 11.5921, "step": 2390, "task_loss": 0.4661827087402344 }, { "epoch": 6.0, "eval_accuracy": 0.949543983524566, "eval_loss": 11.41934871673584, "eval_runtime": 32.632, "eval_samples_per_second": 208.323, "eval_steps_per_second": 3.279, "step": 2394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.861173629760742, "epoch": 6.02, "learning_rate": 6.98830409356725e-05, "loss": 5.8657, "step": 2400, "task_loss": 1.1219100952148438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.0635175704956055, "epoch": 6.04, "learning_rate": 6.982456140350876e-05, "loss": 1.3205, "step": 2410, "task_loss": 0.6126556396484375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.927743911743164, "epoch": 6.07, "learning_rate": 6.953216374269006e-05, "loss": 1.2488, "step": 2420, "task_loss": 1.1561241149902344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.970916748046875, "epoch": 6.09, "learning_rate": 6.923976608187134e-05, "loss": 1.1811, "step": 2430, "task_loss": 0.5452232360839844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.554920196533203, "epoch": 6.12, "learning_rate": 6.894736842105262e-05, "loss": 1.2511, "step": 2440, "task_loss": 1.2945747375488281 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.541433334350586, "epoch": 6.14, "learning_rate": 6.865497076023391e-05, "loss": 1.1504, "step": 2450, "task_loss": 0.9271202087402344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.1273722648620605, "epoch": 6.17, "learning_rate": 6.83625730994152e-05, "loss": 1.0371, "step": 2460, "task_loss": 0.2785224914550781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.357429504394531, "epoch": 6.19, "learning_rate": 6.807017543859647e-05, "loss": 1.1199, "step": 2470, "task_loss": 0.8261871337890625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.609196662902832, "epoch": 6.22, "learning_rate": 6.777777777777777e-05, "loss": 1.1618, "step": 2480, "task_loss": 0.7250289916992188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.585407733917236, "epoch": 6.24, "learning_rate": 6.748538011695905e-05, "loss": 1.2025, "step": 2490, "task_loss": 0.6468124389648438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.654393672943115, "epoch": 6.27, "learning_rate": 6.719298245614034e-05, "loss": 1.0915, "step": 2500, "task_loss": 0.7687530517578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.18943977355957, "epoch": 6.29, "learning_rate": 6.690058479532162e-05, "loss": 1.2157, "step": 2510, "task_loss": 0.695220947265625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.329128742218018, "epoch": 6.32, "learning_rate": 6.660818713450292e-05, "loss": 1.1449, "step": 2520, "task_loss": 0.6765670776367188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.950167179107666, "epoch": 6.34, "learning_rate": 6.63157894736842e-05, "loss": 1.3043, "step": 2530, "task_loss": 0.5440254211425781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.431341171264648, "epoch": 6.37, "learning_rate": 6.602339181286549e-05, "loss": 1.082, "step": 2540, "task_loss": 0.7024917602539062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.697885990142822, "epoch": 6.39, "learning_rate": 6.573099415204677e-05, "loss": 1.0729, "step": 2550, "task_loss": 0.5230712890625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.907511234283447, "epoch": 6.42, "learning_rate": 6.543859649122807e-05, "loss": 1.0617, "step": 2560, "task_loss": 0.42108154296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.777693748474121, "epoch": 6.44, "learning_rate": 6.514619883040935e-05, "loss": 1.0277, "step": 2570, "task_loss": 0.4836235046386719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.154238224029541, "epoch": 6.47, "learning_rate": 6.485380116959064e-05, "loss": 1.0124, "step": 2580, "task_loss": 0.4884605407714844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.249798774719238, "epoch": 6.49, "learning_rate": 6.456140350877192e-05, "loss": 1.0007, "step": 2590, "task_loss": 0.5590858459472656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.995944976806641, "epoch": 6.52, "learning_rate": 6.426900584795322e-05, "loss": 0.9755, "step": 2600, "task_loss": 0.5008811950683594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.318019866943359, "epoch": 6.54, "learning_rate": 6.39766081871345e-05, "loss": 1.0208, "step": 2610, "task_loss": 0.5252456665039062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.776905059814453, "epoch": 6.57, "learning_rate": 6.368421052631579e-05, "loss": 0.9671, "step": 2620, "task_loss": 0.3272743225097656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.987513542175293, "epoch": 6.59, "learning_rate": 6.339181286549707e-05, "loss": 1.053, "step": 2630, "task_loss": 0.519989013671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.926828384399414, "epoch": 6.62, "learning_rate": 6.309941520467835e-05, "loss": 1.0345, "step": 2640, "task_loss": 0.7057571411132812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.94542121887207, "epoch": 6.64, "learning_rate": 6.280701754385965e-05, "loss": 0.938, "step": 2650, "task_loss": 0.3907585144042969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.4620361328125, "epoch": 6.67, "learning_rate": 6.251461988304093e-05, "loss": 0.9972, "step": 2660, "task_loss": 0.5462303161621094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.011201858520508, "epoch": 6.69, "learning_rate": 6.222222222222222e-05, "loss": 0.9856, "step": 2670, "task_loss": 0.4324302673339844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.679705619812012, "epoch": 6.72, "learning_rate": 6.19298245614035e-05, "loss": 0.9679, "step": 2680, "task_loss": 0.389556884765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.447802543640137, "epoch": 6.74, "learning_rate": 6.163742690058478e-05, "loss": 0.9855, "step": 2690, "task_loss": 0.74847412109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.8404693603515625, "epoch": 6.77, "learning_rate": 6.134502923976607e-05, "loss": 0.9994, "step": 2700, "task_loss": 0.3144569396972656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.578929424285889, "epoch": 6.79, "learning_rate": 6.105263157894736e-05, "loss": 0.9882, "step": 2710, "task_loss": 0.34210968017578125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.381324291229248, "epoch": 6.82, "learning_rate": 6.076023391812865e-05, "loss": 0.9512, "step": 2720, "task_loss": 0.75970458984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.710232734680176, "epoch": 6.84, "learning_rate": 6.046783625730993e-05, "loss": 0.8841, "step": 2730, "task_loss": 0.4958953857421875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.758987903594971, "epoch": 6.87, "learning_rate": 6.0175438596491224e-05, "loss": 0.9082, "step": 2740, "task_loss": 0.44652748107910156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.766467571258545, "epoch": 6.89, "learning_rate": 5.9883040935672504e-05, "loss": 0.8834, "step": 2750, "task_loss": 0.32134056091308594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.766737937927246, "epoch": 6.92, "learning_rate": 5.95906432748538e-05, "loss": 0.909, "step": 2760, "task_loss": 0.3227996826171875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.27384090423584, "epoch": 6.94, "learning_rate": 5.929824561403508e-05, "loss": 0.9074, "step": 2770, "task_loss": 0.5718650817871094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.613331317901611, "epoch": 6.97, "learning_rate": 5.900584795321637e-05, "loss": 0.898, "step": 2780, "task_loss": 0.3683891296386719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.084227561950684, "epoch": 6.99, "learning_rate": 5.871345029239765e-05, "loss": 0.8911, "step": 2790, "task_loss": 0.5007247924804688 }, { "epoch": 7.0, "eval_accuracy": 0.9499852897911151, "eval_loss": 0.7333924174308777, "eval_runtime": 31.4508, "eval_samples_per_second": 216.147, "eval_steps_per_second": 3.402, "step": 2793 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.306143283843994, "epoch": 7.02, "learning_rate": 5.842105263157894e-05, "loss": 0.9263, "step": 2800, "task_loss": 0.35605812072753906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.722840309143066, "epoch": 7.04, "learning_rate": 5.812865497076023e-05, "loss": 0.8924, "step": 2810, "task_loss": 0.42547607421875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.779755592346191, "epoch": 7.07, "learning_rate": 5.7836257309941515e-05, "loss": 0.9184, "step": 2820, "task_loss": 0.3804302215576172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.981483459472656, "epoch": 7.09, "learning_rate": 5.7543859649122795e-05, "loss": 0.8944, "step": 2830, "task_loss": 0.5712451934814453 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.986885070800781, "epoch": 7.12, "learning_rate": 5.725146198830409e-05, "loss": 0.8474, "step": 2840, "task_loss": 0.4367961883544922 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.377161026000977, "epoch": 7.14, "learning_rate": 5.695906432748537e-05, "loss": 0.8852, "step": 2850, "task_loss": 0.44561195373535156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.972238540649414, "epoch": 7.17, "learning_rate": 5.6666666666666664e-05, "loss": 0.9092, "step": 2860, "task_loss": 0.6264076232910156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.316938400268555, "epoch": 7.19, "learning_rate": 5.6374269005847944e-05, "loss": 0.8129, "step": 2870, "task_loss": 0.3249168395996094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.2883501052856445, "epoch": 7.22, "learning_rate": 5.608187134502924e-05, "loss": 0.8543, "step": 2880, "task_loss": 0.4031486511230469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.069027900695801, "epoch": 7.24, "learning_rate": 5.578947368421052e-05, "loss": 0.8383, "step": 2890, "task_loss": 0.6104450225830078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.340886116027832, "epoch": 7.27, "learning_rate": 5.5497076023391806e-05, "loss": 0.8481, "step": 2900, "task_loss": 0.4594745635986328 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.072140693664551, "epoch": 7.29, "learning_rate": 5.5204678362573093e-05, "loss": 0.8896, "step": 2910, "task_loss": 0.47344017028808594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.216567516326904, "epoch": 7.32, "learning_rate": 5.491228070175438e-05, "loss": 0.882, "step": 2920, "task_loss": 0.36626625061035156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.011429786682129, "epoch": 7.34, "learning_rate": 5.461988304093567e-05, "loss": 0.8725, "step": 2930, "task_loss": 0.6344451904296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.870144844055176, "epoch": 7.37, "learning_rate": 5.4327485380116955e-05, "loss": 0.863, "step": 2940, "task_loss": 0.7333259582519531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.823521137237549, "epoch": 7.39, "learning_rate": 5.4035087719298236e-05, "loss": 0.8444, "step": 2950, "task_loss": 0.46866416931152344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.694486141204834, "epoch": 7.42, "learning_rate": 5.374269005847953e-05, "loss": 0.8918, "step": 2960, "task_loss": 0.4683876037597656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.904254913330078, "epoch": 7.44, "learning_rate": 5.345029239766081e-05, "loss": 0.9154, "step": 2970, "task_loss": 0.2806873321533203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.725863456726074, "epoch": 7.47, "learning_rate": 5.3157894736842104e-05, "loss": 0.8564, "step": 2980, "task_loss": 0.32525062561035156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.622794151306152, "epoch": 7.49, "learning_rate": 5.2865497076023385e-05, "loss": 0.7956, "step": 2990, "task_loss": 0.34836578369140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.336671829223633, "epoch": 7.52, "learning_rate": 5.257309941520467e-05, "loss": 0.838, "step": 3000, "task_loss": 0.38922119140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.325677871704102, "epoch": 7.54, "learning_rate": 5.228070175438596e-05, "loss": 0.8417, "step": 3010, "task_loss": 0.17560958862304688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.489519119262695, "epoch": 7.57, "learning_rate": 5.1988304093567246e-05, "loss": 0.8149, "step": 3020, "task_loss": 0.27367401123046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.944852828979492, "epoch": 7.59, "learning_rate": 5.1695906432748534e-05, "loss": 0.8294, "step": 3030, "task_loss": 0.4350738525390625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.888934135437012, "epoch": 7.62, "learning_rate": 5.140350877192982e-05, "loss": 0.8402, "step": 3040, "task_loss": 0.5326957702636719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.191391944885254, "epoch": 7.64, "learning_rate": 5.11111111111111e-05, "loss": 0.8887, "step": 3050, "task_loss": 0.48697662353515625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.0092315673828125, "epoch": 7.67, "learning_rate": 5.0818713450292395e-05, "loss": 0.8341, "step": 3060, "task_loss": 0.44212913513183594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.650253772735596, "epoch": 7.69, "learning_rate": 5.0526315789473676e-05, "loss": 0.8446, "step": 3070, "task_loss": 0.4129371643066406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.406930923461914, "epoch": 7.72, "learning_rate": 5.023391812865497e-05, "loss": 0.8651, "step": 3080, "task_loss": 0.32791900634765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.466087341308594, "epoch": 7.74, "learning_rate": 4.994152046783625e-05, "loss": 0.8344, "step": 3090, "task_loss": 0.2927818298339844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.629019737243652, "epoch": 7.77, "learning_rate": 4.964912280701754e-05, "loss": 0.836, "step": 3100, "task_loss": 0.3524494171142578 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.530261039733887, "epoch": 7.79, "learning_rate": 4.9356725146198825e-05, "loss": 0.8293, "step": 3110, "task_loss": 0.22118759155273438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.515254497528076, "epoch": 7.82, "learning_rate": 4.906432748538011e-05, "loss": 0.826, "step": 3120, "task_loss": 0.2852344512939453 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.249804496765137, "epoch": 7.84, "learning_rate": 4.87719298245614e-05, "loss": 0.8362, "step": 3130, "task_loss": 0.4818267822265625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.5110697746276855, "epoch": 7.87, "learning_rate": 4.8479532163742687e-05, "loss": 0.8203, "step": 3140, "task_loss": 0.3102607727050781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.204464912414551, "epoch": 7.89, "learning_rate": 4.818713450292397e-05, "loss": 0.8568, "step": 3150, "task_loss": 0.5065097808837891 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.663612365722656, "epoch": 7.92, "learning_rate": 4.789473684210526e-05, "loss": 0.7996, "step": 3160, "task_loss": 0.39002227783203125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.3519182205200195, "epoch": 7.94, "learning_rate": 4.760233918128654e-05, "loss": 0.8175, "step": 3170, "task_loss": 0.3212089538574219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.867648124694824, "epoch": 7.97, "learning_rate": 4.7309941520467836e-05, "loss": 0.8324, "step": 3180, "task_loss": 0.5879039764404297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.661693096160889, "epoch": 7.99, "learning_rate": 4.7017543859649116e-05, "loss": 0.8965, "step": 3190, "task_loss": 0.35546112060546875 }, { "epoch": 8.0, "eval_accuracy": 0.9685201529861724, "eval_loss": 0.6553402543067932, "eval_runtime": 32.0734, "eval_samples_per_second": 211.952, "eval_steps_per_second": 3.336, "step": 3192 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.4450273513793945, "epoch": 8.02, "learning_rate": 4.67251461988304e-05, "loss": 0.8169, "step": 3200, "task_loss": 0.3982429504394531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.912228584289551, "epoch": 8.05, "learning_rate": 4.643274853801169e-05, "loss": 0.7803, "step": 3210, "task_loss": 0.46903228759765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.589568138122559, "epoch": 8.07, "learning_rate": 4.614035087719298e-05, "loss": 0.7964, "step": 3220, "task_loss": 0.4210052490234375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.485812187194824, "epoch": 8.1, "learning_rate": 4.5847953216374265e-05, "loss": 0.8213, "step": 3230, "task_loss": 0.33795166015625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.902305603027344, "epoch": 8.12, "learning_rate": 4.555555555555555e-05, "loss": 0.8195, "step": 3240, "task_loss": 0.4801959991455078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.018232345581055, "epoch": 8.15, "learning_rate": 4.529239766081871e-05, "loss": 0.8123, "step": 3250, "task_loss": 0.2711772918701172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.282988548278809, "epoch": 8.17, "learning_rate": 4.4999999999999996e-05, "loss": 0.786, "step": 3260, "task_loss": 0.3115501403808594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.417398452758789, "epoch": 8.2, "learning_rate": 4.470760233918128e-05, "loss": 0.8198, "step": 3270, "task_loss": 0.41504669189453125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.6695451736450195, "epoch": 8.22, "learning_rate": 4.441520467836257e-05, "loss": 0.8408, "step": 3280, "task_loss": 0.3230552673339844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.235175132751465, "epoch": 8.25, "learning_rate": 4.412280701754386e-05, "loss": 0.7835, "step": 3290, "task_loss": 0.438812255859375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.694821834564209, "epoch": 8.27, "learning_rate": 4.3830409356725145e-05, "loss": 0.8003, "step": 3300, "task_loss": 0.5104885101318359 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.135303974151611, "epoch": 8.3, "learning_rate": 4.3538011695906426e-05, "loss": 0.8443, "step": 3310, "task_loss": 0.4680938720703125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.147139072418213, "epoch": 8.32, "learning_rate": 4.324561403508772e-05, "loss": 0.8186, "step": 3320, "task_loss": 0.282958984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.794891834259033, "epoch": 8.35, "learning_rate": 4.2953216374269e-05, "loss": 0.7849, "step": 3330, "task_loss": 0.34975242614746094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.637408256530762, "epoch": 8.37, "learning_rate": 4.2660818713450294e-05, "loss": 0.8, "step": 3340, "task_loss": 0.27742767333984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.426722526550293, "epoch": 8.4, "learning_rate": 4.2368421052631575e-05, "loss": 0.7673, "step": 3350, "task_loss": 0.3129863739013672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.3653244972229, "epoch": 8.42, "learning_rate": 4.207602339181287e-05, "loss": 0.7526, "step": 3360, "task_loss": 0.30819129943847656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.321964263916016, "epoch": 8.45, "learning_rate": 4.178362573099415e-05, "loss": 0.7529, "step": 3370, "task_loss": 0.19260787963867188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.544368743896484, "epoch": 8.47, "learning_rate": 4.1491228070175436e-05, "loss": 0.7815, "step": 3380, "task_loss": 0.348663330078125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.614175796508789, "epoch": 8.5, "learning_rate": 4.1198830409356724e-05, "loss": 0.7548, "step": 3390, "task_loss": 0.23873519897460938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.2296342849731445, "epoch": 8.52, "learning_rate": 4.090643274853801e-05, "loss": 0.7977, "step": 3400, "task_loss": 0.1718120574951172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.963824987411499, "epoch": 8.55, "learning_rate": 4.061403508771929e-05, "loss": 0.8097, "step": 3410, "task_loss": 0.6822185516357422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.4087910652160645, "epoch": 8.57, "learning_rate": 4.0321637426900585e-05, "loss": 0.8424, "step": 3420, "task_loss": 0.3879871368408203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.150110244750977, "epoch": 8.6, "learning_rate": 4.0029239766081866e-05, "loss": 0.7919, "step": 3430, "task_loss": 0.22334671020507812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.690118789672852, "epoch": 8.62, "learning_rate": 3.973684210526316e-05, "loss": 0.8435, "step": 3440, "task_loss": 0.2780895233154297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.837807655334473, "epoch": 8.65, "learning_rate": 3.944444444444444e-05, "loss": 0.8215, "step": 3450, "task_loss": 0.5651683807373047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.85980749130249, "epoch": 8.67, "learning_rate": 3.9152046783625734e-05, "loss": 0.794, "step": 3460, "task_loss": 0.4184246063232422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.422865867614746, "epoch": 8.7, "learning_rate": 3.8859649122807015e-05, "loss": 0.8066, "step": 3470, "task_loss": 0.31614112854003906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.89119291305542, "epoch": 8.72, "learning_rate": 3.85672514619883e-05, "loss": 0.7719, "step": 3480, "task_loss": 0.27257537841796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.10051965713501, "epoch": 8.75, "learning_rate": 3.827485380116959e-05, "loss": 0.7389, "step": 3490, "task_loss": 0.5138072967529297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.613463878631592, "epoch": 8.77, "learning_rate": 3.7982456140350876e-05, "loss": 0.7425, "step": 3500, "task_loss": 0.25890350341796875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.8474650382995605, "epoch": 8.8, "learning_rate": 3.769005847953216e-05, "loss": 0.7373, "step": 3510, "task_loss": 0.14526081085205078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.127475738525391, "epoch": 8.82, "learning_rate": 3.739766081871345e-05, "loss": 0.8208, "step": 3520, "task_loss": 0.6845426559448242 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.054192543029785, "epoch": 8.85, "learning_rate": 3.710526315789473e-05, "loss": 0.7686, "step": 3530, "task_loss": 0.3179588317871094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.050369739532471, "epoch": 8.87, "learning_rate": 3.6812865497076025e-05, "loss": 0.8039, "step": 3540, "task_loss": 0.2579622268676758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.5263357162475586, "epoch": 8.9, "learning_rate": 3.6520467836257306e-05, "loss": 0.8152, "step": 3550, "task_loss": 0.2673015594482422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.833356857299805, "epoch": 8.92, "learning_rate": 3.62280701754386e-05, "loss": 0.7765, "step": 3560, "task_loss": 0.37337303161621094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.146718978881836, "epoch": 8.95, "learning_rate": 3.593567251461988e-05, "loss": 0.7824, "step": 3570, "task_loss": 0.32893943786621094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.586151123046875, "epoch": 8.97, "learning_rate": 3.564327485380117e-05, "loss": 0.7662, "step": 3580, "task_loss": 0.4216928482055664 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9081151485443115, "epoch": 9.0, "learning_rate": 3.5350877192982455e-05, "loss": 0.7198, "step": 3590, "task_loss": 0.3142738342285156 }, { "epoch": 9.0, "eval_accuracy": 0.9669020300088261, "eval_loss": 0.6212905049324036, "eval_runtime": 32.135, "eval_samples_per_second": 211.545, "eval_steps_per_second": 3.33, "step": 3591 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.315699577331543, "epoch": 9.02, "learning_rate": 3.505847953216374e-05, "loss": 0.7693, "step": 3600, "task_loss": 0.23903274536132812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.7046122550964355, "epoch": 9.05, "learning_rate": 3.476608187134503e-05, "loss": 0.7464, "step": 3610, "task_loss": 0.23265552520751953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.232175827026367, "epoch": 9.07, "learning_rate": 3.447368421052631e-05, "loss": 0.7755, "step": 3620, "task_loss": 0.4364604949951172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.088405609130859, "epoch": 9.1, "learning_rate": 3.41812865497076e-05, "loss": 0.7585, "step": 3630, "task_loss": 0.23165130615234375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.075429916381836, "epoch": 9.12, "learning_rate": 3.3888888888888884e-05, "loss": 0.7904, "step": 3640, "task_loss": 0.5744905471801758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 5.020939350128174, "epoch": 9.15, "learning_rate": 3.359649122807017e-05, "loss": 0.7415, "step": 3650, "task_loss": 0.4139251708984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.201698303222656, "epoch": 9.17, "learning_rate": 3.330409356725146e-05, "loss": 0.7596, "step": 3660, "task_loss": 0.36458587646484375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9503705501556396, "epoch": 9.2, "learning_rate": 3.3011695906432746e-05, "loss": 0.7421, "step": 3670, "task_loss": 0.2739439010620117 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.7443084716796875, "epoch": 9.22, "learning_rate": 3.2719298245614033e-05, "loss": 0.7744, "step": 3680, "task_loss": 0.47455596923828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.245865821838379, "epoch": 9.25, "learning_rate": 3.242690058479532e-05, "loss": 0.7339, "step": 3690, "task_loss": 0.3756685256958008 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.305609703063965, "epoch": 9.27, "learning_rate": 3.213450292397661e-05, "loss": 0.7735, "step": 3700, "task_loss": 0.324432373046875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.544186592102051, "epoch": 9.3, "learning_rate": 3.1842105263157895e-05, "loss": 0.7711, "step": 3710, "task_loss": 0.4451179504394531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.720974922180176, "epoch": 9.32, "learning_rate": 3.1549707602339176e-05, "loss": 0.7435, "step": 3720, "task_loss": 0.6814441680908203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.221366882324219, "epoch": 9.35, "learning_rate": 3.125730994152046e-05, "loss": 0.6939, "step": 3730, "task_loss": 0.28525352478027344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.022902965545654, "epoch": 9.37, "learning_rate": 3.096491228070175e-05, "loss": 0.7365, "step": 3740, "task_loss": 0.15752792358398438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.552103519439697, "epoch": 9.4, "learning_rate": 3.067251461988304e-05, "loss": 0.7366, "step": 3750, "task_loss": 0.36133861541748047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.75297474861145, "epoch": 9.42, "learning_rate": 3.0380116959064325e-05, "loss": 0.7271, "step": 3760, "task_loss": 0.16461563110351562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.081660270690918, "epoch": 9.45, "learning_rate": 3.0087719298245612e-05, "loss": 0.7333, "step": 3770, "task_loss": 0.2214345932006836 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.590606689453125, "epoch": 9.47, "learning_rate": 2.97953216374269e-05, "loss": 0.7534, "step": 3780, "task_loss": 0.4469118118286133 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9362525939941406, "epoch": 9.5, "learning_rate": 2.9502923976608186e-05, "loss": 0.733, "step": 3790, "task_loss": 0.22396183013916016 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9870333671569824, "epoch": 9.52, "learning_rate": 2.921052631578947e-05, "loss": 0.7409, "step": 3800, "task_loss": 0.3021268844604492 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.993597984313965, "epoch": 9.55, "learning_rate": 2.8918128654970757e-05, "loss": 0.7229, "step": 3810, "task_loss": 0.1641530990600586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.234393119812012, "epoch": 9.57, "learning_rate": 2.8625730994152045e-05, "loss": 0.7313, "step": 3820, "task_loss": 0.22105979919433594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.075281143188477, "epoch": 9.6, "learning_rate": 2.8333333333333332e-05, "loss": 0.7686, "step": 3830, "task_loss": 0.27436161041259766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.471318244934082, "epoch": 9.62, "learning_rate": 2.804093567251462e-05, "loss": 0.7691, "step": 3840, "task_loss": 0.7720394134521484 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.824963569641113, "epoch": 9.65, "learning_rate": 2.7748538011695903e-05, "loss": 0.745, "step": 3850, "task_loss": 0.3804445266723633 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.057116508483887, "epoch": 9.67, "learning_rate": 2.745614035087719e-05, "loss": 0.7789, "step": 3860, "task_loss": 0.3109712600708008 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6748297214508057, "epoch": 9.7, "learning_rate": 2.7163742690058478e-05, "loss": 0.7308, "step": 3870, "task_loss": 0.26787662506103516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.097855567932129, "epoch": 9.72, "learning_rate": 2.6871345029239765e-05, "loss": 0.7423, "step": 3880, "task_loss": 0.4132719039916992 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6987948417663574, "epoch": 9.75, "learning_rate": 2.6578947368421052e-05, "loss": 0.7511, "step": 3890, "task_loss": 0.3288288116455078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6531050205230713, "epoch": 9.77, "learning_rate": 2.6286549707602336e-05, "loss": 0.7149, "step": 3900, "task_loss": 0.23686599731445312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.270119667053223, "epoch": 9.8, "learning_rate": 2.5994152046783623e-05, "loss": 0.7163, "step": 3910, "task_loss": 0.5312776565551758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.358315467834473, "epoch": 9.82, "learning_rate": 2.570175438596491e-05, "loss": 0.7294, "step": 3920, "task_loss": 0.2489185333251953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.781044006347656, "epoch": 9.85, "learning_rate": 2.5409356725146198e-05, "loss": 0.751, "step": 3930, "task_loss": 0.6150417327880859 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.537405014038086, "epoch": 9.87, "learning_rate": 2.5116959064327485e-05, "loss": 0.7216, "step": 3940, "task_loss": 0.27581310272216797 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.4504852294921875, "epoch": 9.9, "learning_rate": 2.482456140350877e-05, "loss": 0.7078, "step": 3950, "task_loss": 0.27785396575927734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.741490364074707, "epoch": 9.92, "learning_rate": 2.4532163742690056e-05, "loss": 0.7143, "step": 3960, "task_loss": 0.2519559860229492 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.306387901306152, "epoch": 9.95, "learning_rate": 2.4239766081871343e-05, "loss": 0.722, "step": 3970, "task_loss": 0.4006071090698242 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9580562114715576, "epoch": 9.97, "learning_rate": 2.394736842105263e-05, "loss": 0.7629, "step": 3980, "task_loss": 0.30501747131347656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.427006721496582, "epoch": 10.0, "learning_rate": 2.3654970760233918e-05, "loss": 0.7372, "step": 3990, "task_loss": 0.2783851623535156 }, { "epoch": 10.0, "eval_accuracy": 0.9674904383642248, "eval_loss": 0.5929271578788757, "eval_runtime": 32.1185, "eval_samples_per_second": 211.654, "eval_steps_per_second": 3.331, "step": 3990 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.8730010986328125, "epoch": 10.03, "learning_rate": 2.33625730994152e-05, "loss": 0.7364, "step": 4000, "task_loss": 0.2448558807373047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.606790542602539, "epoch": 10.05, "learning_rate": 2.307017543859649e-05, "loss": 0.7255, "step": 4010, "task_loss": 0.37810707092285156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.424098014831543, "epoch": 10.08, "learning_rate": 2.2777777777777776e-05, "loss": 0.751, "step": 4020, "task_loss": 0.49766063690185547 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.420840263366699, "epoch": 10.1, "learning_rate": 2.2485380116959063e-05, "loss": 0.7456, "step": 4030, "task_loss": 0.688593864440918 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.095724105834961, "epoch": 10.13, "learning_rate": 2.219298245614035e-05, "loss": 0.6804, "step": 4040, "task_loss": 0.28264522552490234 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.558351516723633, "epoch": 10.15, "learning_rate": 2.1900584795321638e-05, "loss": 0.7459, "step": 4050, "task_loss": 0.26221275329589844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.902848243713379, "epoch": 10.18, "learning_rate": 2.1608187134502922e-05, "loss": 0.7224, "step": 4060, "task_loss": 0.5789623260498047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.917043924331665, "epoch": 10.2, "learning_rate": 2.131578947368421e-05, "loss": 0.7054, "step": 4070, "task_loss": 0.27019596099853516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.579798698425293, "epoch": 10.23, "learning_rate": 2.1023391812865496e-05, "loss": 0.6975, "step": 4080, "task_loss": 0.4274778366088867 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.8784656524658203, "epoch": 10.25, "learning_rate": 2.0730994152046784e-05, "loss": 0.6941, "step": 4090, "task_loss": 0.40457916259765625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.8084497451782227, "epoch": 10.28, "learning_rate": 2.043859649122807e-05, "loss": 0.6909, "step": 4100, "task_loss": 0.3487281799316406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9689548015594482, "epoch": 10.3, "learning_rate": 2.0146198830409355e-05, "loss": 0.6787, "step": 4110, "task_loss": 0.24568462371826172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.155590057373047, "epoch": 10.33, "learning_rate": 1.9853801169590642e-05, "loss": 0.6842, "step": 4120, "task_loss": 0.34735965728759766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.2108330726623535, "epoch": 10.35, "learning_rate": 1.956140350877193e-05, "loss": 0.7041, "step": 4130, "task_loss": 0.2593955993652344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.016105651855469, "epoch": 10.38, "learning_rate": 1.9269005847953216e-05, "loss": 0.6836, "step": 4140, "task_loss": 0.24969100952148438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.97172212600708, "epoch": 10.4, "learning_rate": 1.8976608187134504e-05, "loss": 0.6628, "step": 4150, "task_loss": 0.15788745880126953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.238569259643555, "epoch": 10.43, "learning_rate": 1.8684210526315787e-05, "loss": 0.6634, "step": 4160, "task_loss": 0.18412494659423828 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.165921211242676, "epoch": 10.45, "learning_rate": 1.8391812865497075e-05, "loss": 0.6791, "step": 4170, "task_loss": 0.1882610321044922 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.925244092941284, "epoch": 10.48, "learning_rate": 1.8099415204678362e-05, "loss": 0.7311, "step": 4180, "task_loss": 0.355316162109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.39293098449707, "epoch": 10.5, "learning_rate": 1.780701754385965e-05, "loss": 0.6684, "step": 4190, "task_loss": 0.17953014373779297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.603082180023193, "epoch": 10.53, "learning_rate": 1.7514619883040936e-05, "loss": 0.715, "step": 4200, "task_loss": 0.3889150619506836 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6784658432006836, "epoch": 10.55, "learning_rate": 1.722222222222222e-05, "loss": 0.666, "step": 4210, "task_loss": 0.1250457763671875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.187887191772461, "epoch": 10.58, "learning_rate": 1.6929824561403508e-05, "loss": 0.7071, "step": 4220, "task_loss": 0.22658157348632812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.710993766784668, "epoch": 10.6, "learning_rate": 1.663742690058479e-05, "loss": 0.647, "step": 4230, "task_loss": 0.4486532211303711 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.261455535888672, "epoch": 10.63, "learning_rate": 1.634502923976608e-05, "loss": 0.7054, "step": 4240, "task_loss": 0.28383445739746094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.049722671508789, "epoch": 10.65, "learning_rate": 1.6052631578947366e-05, "loss": 0.6497, "step": 4250, "task_loss": 0.19504547119140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.419764995574951, "epoch": 10.68, "learning_rate": 1.5760233918128653e-05, "loss": 0.6706, "step": 4260, "task_loss": 0.1890125274658203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.287445068359375, "epoch": 10.7, "learning_rate": 1.546783625730994e-05, "loss": 0.6961, "step": 4270, "task_loss": 0.3999347686767578 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.919074535369873, "epoch": 10.73, "learning_rate": 1.5175438596491226e-05, "loss": 0.6553, "step": 4280, "task_loss": 0.16851806640625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.20687198638916, "epoch": 10.75, "learning_rate": 1.4883040935672513e-05, "loss": 0.6934, "step": 4290, "task_loss": 0.18524169921875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6193199157714844, "epoch": 10.78, "learning_rate": 1.4590643274853799e-05, "loss": 0.7152, "step": 4300, "task_loss": 0.3484325408935547 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.201796054840088, "epoch": 10.8, "learning_rate": 1.4298245614035086e-05, "loss": 0.6658, "step": 4310, "task_loss": 0.14459705352783203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.3845696449279785, "epoch": 10.83, "learning_rate": 1.4005847953216372e-05, "loss": 0.6703, "step": 4320, "task_loss": 0.3483409881591797 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.77663516998291, "epoch": 10.85, "learning_rate": 1.3713450292397659e-05, "loss": 0.6738, "step": 4330, "task_loss": 0.3550996780395508 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.02517032623291, "epoch": 10.88, "learning_rate": 1.3421052631578946e-05, "loss": 0.6886, "step": 4340, "task_loss": 0.2996025085449219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.099001884460449, "epoch": 10.9, "learning_rate": 1.3128654970760232e-05, "loss": 0.6527, "step": 4350, "task_loss": 0.2748298645019531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.5707130432128906, "epoch": 10.93, "learning_rate": 1.2836257309941519e-05, "loss": 0.6751, "step": 4360, "task_loss": 0.1947002410888672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.698594570159912, "epoch": 10.95, "learning_rate": 1.2543859649122804e-05, "loss": 0.6821, "step": 4370, "task_loss": 0.22928142547607422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.335412979125977, "epoch": 10.98, "learning_rate": 1.2251461988304092e-05, "loss": 0.7004, "step": 4380, "task_loss": 0.1544780731201172 }, { "epoch": 11.0, "eval_accuracy": 0.9720506031185643, "eval_loss": 0.5719800591468811, "eval_runtime": 31.8788, "eval_samples_per_second": 213.245, "eval_steps_per_second": 3.356, "step": 4389 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.011960029602051, "epoch": 11.0, "learning_rate": 1.1959064327485379e-05, "loss": 0.6869, "step": 4390, "task_loss": 0.4317331314086914 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.781693696975708, "epoch": 11.03, "learning_rate": 1.1666666666666665e-05, "loss": 0.6764, "step": 4400, "task_loss": 0.19684505462646484 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.8459486961364746, "epoch": 11.05, "learning_rate": 1.1374269005847952e-05, "loss": 0.6777, "step": 4410, "task_loss": 0.19100379943847656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6235389709472656, "epoch": 11.08, "learning_rate": 1.1081871345029239e-05, "loss": 0.7001, "step": 4420, "task_loss": 0.13420391082763672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.281004905700684, "epoch": 11.1, "learning_rate": 1.0789473684210525e-05, "loss": 0.6775, "step": 4430, "task_loss": 0.18527984619140625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.187185287475586, "epoch": 11.13, "learning_rate": 1.0497076023391812e-05, "loss": 0.6461, "step": 4440, "task_loss": 0.1463918685913086 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.547395706176758, "epoch": 11.15, "learning_rate": 1.0204678362573097e-05, "loss": 0.65, "step": 4450, "task_loss": 0.15275001525878906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.184659481048584, "epoch": 11.18, "learning_rate": 9.912280701754385e-06, "loss": 0.6646, "step": 4460, "task_loss": 0.2873668670654297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.149720191955566, "epoch": 11.2, "learning_rate": 9.619883040935672e-06, "loss": 0.6883, "step": 4470, "task_loss": 0.23235559463500977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.8264083862304688, "epoch": 11.23, "learning_rate": 9.327485380116957e-06, "loss": 0.6637, "step": 4480, "task_loss": 0.42732906341552734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.3232622146606445, "epoch": 11.25, "learning_rate": 9.035087719298245e-06, "loss": 0.6962, "step": 4490, "task_loss": 0.2845592498779297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.556276798248291, "epoch": 11.28, "learning_rate": 8.74269005847953e-06, "loss": 0.6801, "step": 4500, "task_loss": 0.1521158218383789 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.712045192718506, "epoch": 11.3, "learning_rate": 8.450292397660817e-06, "loss": 0.6593, "step": 4510, "task_loss": 0.16354703903198242 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.208836555480957, "epoch": 11.33, "learning_rate": 8.157894736842105e-06, "loss": 0.653, "step": 4520, "task_loss": 0.2885322570800781 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.867947816848755, "epoch": 11.35, "learning_rate": 7.86549707602339e-06, "loss": 0.6689, "step": 4530, "task_loss": 0.15323257446289062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.030797004699707, "epoch": 11.38, "learning_rate": 7.5730994152046775e-06, "loss": 0.6934, "step": 4540, "task_loss": 0.1717977523803711 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.6297760009765625, "epoch": 11.4, "learning_rate": 7.280701754385964e-06, "loss": 0.6432, "step": 4550, "task_loss": 0.25377655029296875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9549574851989746, "epoch": 11.43, "learning_rate": 6.98830409356725e-06, "loss": 0.6326, "step": 4560, "task_loss": 0.3191709518432617 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.870671272277832, "epoch": 11.45, "learning_rate": 6.695906432748537e-06, "loss": 0.6473, "step": 4570, "task_loss": 0.22795486450195312 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.74957275390625, "epoch": 11.48, "learning_rate": 6.403508771929824e-06, "loss": 0.6605, "step": 4580, "task_loss": 0.3093724250793457 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.1387176513671875, "epoch": 11.5, "learning_rate": 6.11111111111111e-06, "loss": 0.6832, "step": 4590, "task_loss": 0.17415571212768555 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.582918167114258, "epoch": 11.53, "learning_rate": 5.818713450292397e-06, "loss": 0.6548, "step": 4600, "task_loss": 0.27022218704223633 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.299407005310059, "epoch": 11.55, "learning_rate": 5.526315789473683e-06, "loss": 0.6569, "step": 4610, "task_loss": 0.18466615676879883 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.10187292098999, "epoch": 11.58, "learning_rate": 5.23391812865497e-06, "loss": 0.6736, "step": 4620, "task_loss": 0.22667455673217773 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.158428192138672, "epoch": 11.6, "learning_rate": 4.941520467836257e-06, "loss": 0.6568, "step": 4630, "task_loss": 0.37564754486083984 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.290033340454102, "epoch": 11.63, "learning_rate": 4.649122807017543e-06, "loss": 0.6461, "step": 4640, "task_loss": 0.23346328735351562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.264878273010254, "epoch": 11.65, "learning_rate": 4.35672514619883e-06, "loss": 0.6535, "step": 4650, "task_loss": 0.5122127532958984 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.0440850257873535, "epoch": 11.68, "learning_rate": 4.064327485380116e-06, "loss": 0.6745, "step": 4660, "task_loss": 0.26698923110961914 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.828075408935547, "epoch": 11.7, "learning_rate": 3.771929824561403e-06, "loss": 0.6448, "step": 4670, "task_loss": 0.17563343048095703 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.513957500457764, "epoch": 11.73, "learning_rate": 3.4795321637426897e-06, "loss": 0.6578, "step": 4680, "task_loss": 0.30327558517456055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.048562049865723, "epoch": 11.75, "learning_rate": 3.187134502923976e-06, "loss": 0.6632, "step": 4690, "task_loss": 0.3211050033569336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.672558784484863, "epoch": 11.78, "learning_rate": 2.894736842105263e-06, "loss": 0.6627, "step": 4700, "task_loss": 0.36839962005615234 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.135615348815918, "epoch": 11.8, "learning_rate": 2.6023391812865493e-06, "loss": 0.6547, "step": 4710, "task_loss": 0.1994314193725586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.733199119567871, "epoch": 11.83, "learning_rate": 2.3099415204678357e-06, "loss": 0.6325, "step": 4720, "task_loss": 0.2401747703552246 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.437094211578369, "epoch": 11.85, "learning_rate": 2.0175438596491226e-06, "loss": 0.6438, "step": 4730, "task_loss": 0.20534133911132812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.671337366104126, "epoch": 11.88, "learning_rate": 1.7251461988304092e-06, "loss": 0.6403, "step": 4740, "task_loss": 0.2982659339904785 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.278988838195801, "epoch": 11.9, "learning_rate": 1.4327485380116958e-06, "loss": 0.6612, "step": 4750, "task_loss": 0.28882884979248047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.320833683013916, "epoch": 11.93, "learning_rate": 1.1403508771929824e-06, "loss": 0.6532, "step": 4760, "task_loss": 0.49799346923828125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 4.11929988861084, "epoch": 11.95, "learning_rate": 8.479532163742689e-07, "loss": 0.643, "step": 4770, "task_loss": 0.34448814392089844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04, "compression/movement_sparsity/importance_threshold": 0.0, "compression/movement_sparsity/linear_layer_sparsity": 0.5806125908047275, "compression/movement_sparsity/model_sparsity": 0.5219564613754268, "compression_loss": 0.0, "distillation_loss": 3.9442811012268066, "epoch": 11.98, "learning_rate": 5.555555555555555e-07, "loss": 0.6195, "step": 4780, "task_loss": 0.22199058532714844 }, { "epoch": 12.0, "eval_accuracy": 0.9755810532509561, "eval_loss": 0.5631720423698425, "eval_runtime": 31.8949, "eval_samples_per_second": 213.138, "eval_steps_per_second": 3.355, "step": 4788 }, { "epoch": 12.0, "step": 4788, "total_flos": 5.579752612756608e+18, "train_loss": 4.577942104168304, "train_runtime": 8786.4706, "train_samples_per_second": 69.781, "train_steps_per_second": 0.545 } ], "max_steps": 4788, "num_train_epochs": 12, "total_flos": 5.579752612756608e+18, "trial_name": null, "trial_params": null }