{ "best_metric": 0.395076185464859, "best_model_checkpoint": "mikhail-panzo/ceb_b128_le4_s4000/checkpoint-1500", "epoch": 316.83168316831683, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.9603960396039604, "grad_norm": 2.952680826187134, "learning_rate": 2.5e-06, "loss": 0.7707, "step": 50 }, { "epoch": 7.920792079207921, "grad_norm": 1.126000165939331, "learning_rate": 5e-06, "loss": 0.6934, "step": 100 }, { "epoch": 11.881188118811881, "grad_norm": 2.129329204559326, "learning_rate": 7.5e-06, "loss": 0.5982, "step": 150 }, { "epoch": 15.841584158415841, "grad_norm": 1.1155015230178833, "learning_rate": 1e-05, "loss": 0.5093, "step": 200 }, { "epoch": 19.801980198019802, "grad_norm": 1.3643360137939453, "learning_rate": 1.25e-05, "loss": 0.4912, "step": 250 }, { "epoch": 23.762376237623762, "grad_norm": 0.9774149060249329, "learning_rate": 1.5e-05, "loss": 0.4764, "step": 300 }, { "epoch": 27.722772277227723, "grad_norm": 0.8793525695800781, "learning_rate": 1.75e-05, "loss": 0.4648, "step": 350 }, { "epoch": 31.683168316831683, "grad_norm": 0.7882798910140991, "learning_rate": 2e-05, "loss": 0.459, "step": 400 }, { "epoch": 35.64356435643565, "grad_norm": 1.2033296823501587, "learning_rate": 2.25e-05, "loss": 0.4519, "step": 450 }, { "epoch": 39.603960396039604, "grad_norm": 0.8620955348014832, "learning_rate": 2.5e-05, "loss": 0.4433, "step": 500 }, { "epoch": 39.603960396039604, "eval_loss": 0.4056467115879059, "eval_runtime": 7.4199, "eval_samples_per_second": 24.259, "eval_steps_per_second": 3.1, "step": 500 }, { "epoch": 43.56435643564357, "grad_norm": 1.266213297843933, "learning_rate": 2.7500000000000004e-05, "loss": 0.444, "step": 550 }, { "epoch": 47.524752475247524, "grad_norm": 1.174134373664856, "learning_rate": 3e-05, "loss": 0.437, "step": 600 }, { "epoch": 51.48514851485149, "grad_norm": 0.8188551664352417, "learning_rate": 3.2500000000000004e-05, "loss": 0.4318, "step": 650 }, { "epoch": 55.445544554455445, "grad_norm": 1.3083854913711548, "learning_rate": 3.5e-05, "loss": 0.4297, "step": 700 }, { "epoch": 59.40594059405941, "grad_norm": 0.8094576001167297, "learning_rate": 3.7500000000000003e-05, "loss": 0.4238, "step": 750 }, { "epoch": 63.366336633663366, "grad_norm": 1.2506986856460571, "learning_rate": 4e-05, "loss": 0.4233, "step": 800 }, { "epoch": 67.32673267326733, "grad_norm": 1.7902193069458008, "learning_rate": 4.25e-05, "loss": 0.4208, "step": 850 }, { "epoch": 71.2871287128713, "grad_norm": 2.209643602371216, "learning_rate": 4.5e-05, "loss": 0.4195, "step": 900 }, { "epoch": 75.24752475247524, "grad_norm": 2.256385087966919, "learning_rate": 4.75e-05, "loss": 0.4202, "step": 950 }, { "epoch": 79.20792079207921, "grad_norm": 0.9167351126670837, "learning_rate": 5e-05, "loss": 0.4142, "step": 1000 }, { "epoch": 79.20792079207921, "eval_loss": 0.39529716968536377, "eval_runtime": 6.6198, "eval_samples_per_second": 27.191, "eval_steps_per_second": 3.474, "step": 1000 }, { "epoch": 83.16831683168317, "grad_norm": 0.9256124496459961, "learning_rate": 5.25e-05, "loss": 0.4137, "step": 1050 }, { "epoch": 87.12871287128714, "grad_norm": 1.9655492305755615, "learning_rate": 5.500000000000001e-05, "loss": 0.4132, "step": 1100 }, { "epoch": 91.08910891089108, "grad_norm": 1.08187997341156, "learning_rate": 5.7499999999999995e-05, "loss": 0.4078, "step": 1150 }, { "epoch": 95.04950495049505, "grad_norm": 1.8732513189315796, "learning_rate": 6e-05, "loss": 0.4061, "step": 1200 }, { "epoch": 99.00990099009901, "grad_norm": 1.3401920795440674, "learning_rate": 6.25e-05, "loss": 0.4054, "step": 1250 }, { "epoch": 102.97029702970298, "grad_norm": 0.8879286646842957, "learning_rate": 6.500000000000001e-05, "loss": 0.4046, "step": 1300 }, { "epoch": 106.93069306930693, "grad_norm": 1.2213553190231323, "learning_rate": 6.750000000000001e-05, "loss": 0.3999, "step": 1350 }, { "epoch": 110.89108910891089, "grad_norm": 2.05886173248291, "learning_rate": 7e-05, "loss": 0.3983, "step": 1400 }, { "epoch": 114.85148514851485, "grad_norm": 1.10814368724823, "learning_rate": 7.25e-05, "loss": 0.3964, "step": 1450 }, { "epoch": 118.81188118811882, "grad_norm": 1.0220105648040771, "learning_rate": 7.500000000000001e-05, "loss": 0.3972, "step": 1500 }, { "epoch": 118.81188118811882, "eval_loss": 0.395076185464859, "eval_runtime": 7.1134, "eval_samples_per_second": 25.304, "eval_steps_per_second": 3.233, "step": 1500 }, { "epoch": 122.77227722772277, "grad_norm": 1.1671476364135742, "learning_rate": 7.75e-05, "loss": 0.3909, "step": 1550 }, { "epoch": 126.73267326732673, "grad_norm": 0.8167665600776672, "learning_rate": 8e-05, "loss": 0.3913, "step": 1600 }, { "epoch": 130.69306930693068, "grad_norm": 1.9696727991104126, "learning_rate": 8.25e-05, "loss": 0.3888, "step": 1650 }, { "epoch": 134.65346534653466, "grad_norm": 1.1445192098617554, "learning_rate": 8.5e-05, "loss": 0.3885, "step": 1700 }, { "epoch": 138.6138613861386, "grad_norm": 1.908463954925537, "learning_rate": 8.75e-05, "loss": 0.3888, "step": 1750 }, { "epoch": 142.5742574257426, "grad_norm": 1.6358473300933838, "learning_rate": 9e-05, "loss": 0.3878, "step": 1800 }, { "epoch": 146.53465346534654, "grad_norm": 1.7407046556472778, "learning_rate": 9.250000000000001e-05, "loss": 0.3868, "step": 1850 }, { "epoch": 150.4950495049505, "grad_norm": 3.984604597091675, "learning_rate": 9.5e-05, "loss": 0.3877, "step": 1900 }, { "epoch": 154.45544554455446, "grad_norm": 1.5218504667282104, "learning_rate": 9.75e-05, "loss": 0.3828, "step": 1950 }, { "epoch": 158.41584158415841, "grad_norm": 1.2710033655166626, "learning_rate": 0.0001, "loss": 0.3806, "step": 2000 }, { "epoch": 158.41584158415841, "eval_loss": 0.39701494574546814, "eval_runtime": 7.067, "eval_samples_per_second": 25.47, "eval_steps_per_second": 3.255, "step": 2000 }, { "epoch": 162.37623762376236, "grad_norm": 1.905275583267212, "learning_rate": 9.75e-05, "loss": 0.3817, "step": 2050 }, { "epoch": 166.33663366336634, "grad_norm": 2.0633037090301514, "learning_rate": 9.5e-05, "loss": 0.3805, "step": 2100 }, { "epoch": 170.2970297029703, "grad_norm": 1.110232949256897, "learning_rate": 9.250000000000001e-05, "loss": 0.378, "step": 2150 }, { "epoch": 174.25742574257427, "grad_norm": 2.0630099773406982, "learning_rate": 9e-05, "loss": 0.3742, "step": 2200 }, { "epoch": 178.21782178217822, "grad_norm": 1.2983876466751099, "learning_rate": 8.75e-05, "loss": 0.3745, "step": 2250 }, { "epoch": 182.17821782178217, "grad_norm": 0.6691053509712219, "learning_rate": 8.5e-05, "loss": 0.3764, "step": 2300 }, { "epoch": 186.13861386138615, "grad_norm": 1.2052526473999023, "learning_rate": 8.25e-05, "loss": 0.3719, "step": 2350 }, { "epoch": 190.0990099009901, "grad_norm": 1.1967781782150269, "learning_rate": 8e-05, "loss": 0.3711, "step": 2400 }, { "epoch": 194.05940594059405, "grad_norm": 0.8571879267692566, "learning_rate": 7.75e-05, "loss": 0.3715, "step": 2450 }, { "epoch": 198.01980198019803, "grad_norm": 1.2208068370819092, "learning_rate": 7.500000000000001e-05, "loss": 0.3653, "step": 2500 }, { "epoch": 198.01980198019803, "eval_loss": 0.3959733843803406, "eval_runtime": 6.5933, "eval_samples_per_second": 27.3, "eval_steps_per_second": 3.488, "step": 2500 }, { "epoch": 201.98019801980197, "grad_norm": 0.9527010917663574, "learning_rate": 7.25e-05, "loss": 0.3657, "step": 2550 }, { "epoch": 205.94059405940595, "grad_norm": 0.8948647975921631, "learning_rate": 7e-05, "loss": 0.3641, "step": 2600 }, { "epoch": 209.9009900990099, "grad_norm": 0.8640767335891724, "learning_rate": 6.750000000000001e-05, "loss": 0.3613, "step": 2650 }, { "epoch": 213.86138613861385, "grad_norm": 0.6822031736373901, "learning_rate": 6.500000000000001e-05, "loss": 0.3635, "step": 2700 }, { "epoch": 217.82178217821783, "grad_norm": 0.6595421433448792, "learning_rate": 6.25e-05, "loss": 0.3609, "step": 2750 }, { "epoch": 221.78217821782178, "grad_norm": 1.1080352067947388, "learning_rate": 6e-05, "loss": 0.3589, "step": 2800 }, { "epoch": 225.74257425742573, "grad_norm": 0.8837751746177673, "learning_rate": 5.7499999999999995e-05, "loss": 0.3588, "step": 2850 }, { "epoch": 229.7029702970297, "grad_norm": 0.6309406757354736, "learning_rate": 5.500000000000001e-05, "loss": 0.3591, "step": 2900 }, { "epoch": 233.66336633663366, "grad_norm": 1.04404616355896, "learning_rate": 5.25e-05, "loss": 0.3591, "step": 2950 }, { "epoch": 237.62376237623764, "grad_norm": 1.1506808996200562, "learning_rate": 5e-05, "loss": 0.3566, "step": 3000 }, { "epoch": 237.62376237623764, "eval_loss": 0.4034203588962555, "eval_runtime": 7.3045, "eval_samples_per_second": 24.642, "eval_steps_per_second": 3.149, "step": 3000 }, { "epoch": 241.58415841584159, "grad_norm": 0.6666073203086853, "learning_rate": 4.75e-05, "loss": 0.3545, "step": 3050 }, { "epoch": 245.54455445544554, "grad_norm": 0.7490206360816956, "learning_rate": 4.5e-05, "loss": 0.3564, "step": 3100 }, { "epoch": 249.5049504950495, "grad_norm": 0.6986690163612366, "learning_rate": 4.25e-05, "loss": 0.355, "step": 3150 }, { "epoch": 253.46534653465346, "grad_norm": 0.45787835121154785, "learning_rate": 4e-05, "loss": 0.3508, "step": 3200 }, { "epoch": 257.4257425742574, "grad_norm": 0.7505810260772705, "learning_rate": 3.7500000000000003e-05, "loss": 0.355, "step": 3250 }, { "epoch": 261.38613861386136, "grad_norm": 0.6505373120307922, "learning_rate": 3.5e-05, "loss": 0.3547, "step": 3300 }, { "epoch": 265.34653465346537, "grad_norm": 0.5026346445083618, "learning_rate": 3.2500000000000004e-05, "loss": 0.353, "step": 3350 }, { "epoch": 269.3069306930693, "grad_norm": 0.7670180797576904, "learning_rate": 3e-05, "loss": 0.3503, "step": 3400 }, { "epoch": 273.26732673267327, "grad_norm": 0.5188130736351013, "learning_rate": 2.7500000000000004e-05, "loss": 0.3503, "step": 3450 }, { "epoch": 277.2277227722772, "grad_norm": 0.5008130073547363, "learning_rate": 2.5e-05, "loss": 0.349, "step": 3500 }, { "epoch": 277.2277227722772, "eval_loss": 0.3990880846977234, "eval_runtime": 7.7645, "eval_samples_per_second": 23.183, "eval_steps_per_second": 2.962, "step": 3500 }, { "epoch": 281.18811881188117, "grad_norm": 0.5325412154197693, "learning_rate": 2.25e-05, "loss": 0.3471, "step": 3550 }, { "epoch": 285.1485148514852, "grad_norm": 0.5069642066955566, "learning_rate": 2e-05, "loss": 0.3475, "step": 3600 }, { "epoch": 289.1089108910891, "grad_norm": 0.6686875224113464, "learning_rate": 1.75e-05, "loss": 0.3472, "step": 3650 }, { "epoch": 293.0693069306931, "grad_norm": 0.49365779757499695, "learning_rate": 1.5e-05, "loss": 0.3471, "step": 3700 }, { "epoch": 297.029702970297, "grad_norm": 0.4248306155204773, "learning_rate": 1.25e-05, "loss": 0.351, "step": 3750 }, { "epoch": 300.990099009901, "grad_norm": 0.5032713413238525, "learning_rate": 1e-05, "loss": 0.3496, "step": 3800 }, { "epoch": 304.9504950495049, "grad_norm": 0.43658721446990967, "learning_rate": 7.5e-06, "loss": 0.3479, "step": 3850 }, { "epoch": 308.91089108910893, "grad_norm": 0.3654322326183319, "learning_rate": 5e-06, "loss": 0.3483, "step": 3900 }, { "epoch": 312.8712871287129, "grad_norm": 0.4196314811706543, "learning_rate": 2.5e-06, "loss": 0.3448, "step": 3950 }, { "epoch": 316.83168316831683, "grad_norm": 0.39204898476600647, "learning_rate": 0.0, "loss": 0.3466, "step": 4000 }, { "epoch": 316.83168316831683, "eval_loss": 0.39805811643600464, "eval_runtime": 6.6963, "eval_samples_per_second": 26.881, "eval_steps_per_second": 3.435, "step": 4000 } ], "logging_steps": 50, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 334, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.643923525044128e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }