{ "best_metric": 0.39598318934440613, "best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s8000/checkpoint-3500", "epoch": 316.83168316831683, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.9801980198019802, "grad_norm": 1.725104808807373, "learning_rate": 2.5e-06, "loss": 0.7809, "step": 50 }, { "epoch": 3.9603960396039604, "grad_norm": 1.6038875579833984, "learning_rate": 5e-06, "loss": 0.7005, "step": 100 }, { "epoch": 5.9405940594059405, "grad_norm": 1.6793955564498901, "learning_rate": 7.5e-06, "loss": 0.6246, "step": 150 }, { "epoch": 7.920792079207921, "grad_norm": 1.5051871538162231, "learning_rate": 1e-05, "loss": 0.5278, "step": 200 }, { "epoch": 9.900990099009901, "grad_norm": 1.404683232307434, "learning_rate": 1.25e-05, "loss": 0.5095, "step": 250 }, { "epoch": 11.881188118811881, "grad_norm": 1.248382568359375, "learning_rate": 1.5e-05, "loss": 0.4814, "step": 300 }, { "epoch": 13.861386138613861, "grad_norm": 0.995944082736969, "learning_rate": 1.75e-05, "loss": 0.4743, "step": 350 }, { "epoch": 15.841584158415841, "grad_norm": 1.472835659980774, "learning_rate": 2e-05, "loss": 0.4631, "step": 400 }, { "epoch": 17.821782178217823, "grad_norm": 2.442906618118286, "learning_rate": 2.245e-05, "loss": 0.4622, "step": 450 }, { "epoch": 19.801980198019802, "grad_norm": 1.071074366569519, "learning_rate": 2.495e-05, "loss": 0.4561, "step": 500 }, { "epoch": 19.801980198019802, "eval_loss": 0.41511112451553345, "eval_runtime": 7.4967, "eval_samples_per_second": 24.011, "eval_steps_per_second": 3.068, "step": 500 }, { "epoch": 21.782178217821784, "grad_norm": 1.0462185144424438, "learning_rate": 2.7450000000000003e-05, "loss": 0.4465, "step": 550 }, { "epoch": 23.762376237623762, "grad_norm": 1.103574275970459, "learning_rate": 2.995e-05, "loss": 0.4453, "step": 600 }, { "epoch": 25.742574257425744, "grad_norm": 3.00575590133667, "learning_rate": 3.245e-05, "loss": 0.447, "step": 650 }, { "epoch": 27.722772277227723, "grad_norm": 1.786911129951477, "learning_rate": 3.495e-05, "loss": 0.4351, "step": 700 }, { "epoch": 29.702970297029704, "grad_norm": 1.236941933631897, "learning_rate": 3.745e-05, "loss": 0.4347, "step": 750 }, { "epoch": 31.683168316831683, "grad_norm": 1.3743062019348145, "learning_rate": 3.995e-05, "loss": 0.4319, "step": 800 }, { "epoch": 33.663366336633665, "grad_norm": 2.7615420818328857, "learning_rate": 4.245e-05, "loss": 0.4358, "step": 850 }, { "epoch": 35.64356435643565, "grad_norm": 1.662369966506958, "learning_rate": 4.495e-05, "loss": 0.4276, "step": 900 }, { "epoch": 37.62376237623762, "grad_norm": 1.0967382192611694, "learning_rate": 4.745e-05, "loss": 0.4267, "step": 950 }, { "epoch": 39.603960396039604, "grad_norm": 2.530874252319336, "learning_rate": 4.995e-05, "loss": 0.4179, "step": 1000 }, { "epoch": 39.603960396039604, "eval_loss": 0.39941468834877014, "eval_runtime": 7.4617, "eval_samples_per_second": 24.123, "eval_steps_per_second": 3.082, "step": 1000 }, { "epoch": 41.584158415841586, "grad_norm": 2.8653476238250732, "learning_rate": 5.245e-05, "loss": 0.4268, "step": 1050 }, { "epoch": 43.56435643564357, "grad_norm": 1.5550223588943481, "learning_rate": 5.495e-05, "loss": 0.4265, "step": 1100 }, { "epoch": 45.54455445544554, "grad_norm": 1.804150104522705, "learning_rate": 5.745e-05, "loss": 0.4192, "step": 1150 }, { "epoch": 47.524752475247524, "grad_norm": 1.9916889667510986, "learning_rate": 5.995000000000001e-05, "loss": 0.4149, "step": 1200 }, { "epoch": 49.504950495049506, "grad_norm": 2.1027019023895264, "learning_rate": 6.245000000000001e-05, "loss": 0.4203, "step": 1250 }, { "epoch": 51.48514851485149, "grad_norm": 1.1542466878890991, "learning_rate": 6.494999999999999e-05, "loss": 0.4127, "step": 1300 }, { "epoch": 53.46534653465346, "grad_norm": 1.8733513355255127, "learning_rate": 6.745e-05, "loss": 0.4165, "step": 1350 }, { "epoch": 55.445544554455445, "grad_norm": 2.544435739517212, "learning_rate": 6.995e-05, "loss": 0.4156, "step": 1400 }, { "epoch": 57.42574257425743, "grad_norm": 2.9764773845672607, "learning_rate": 7.245000000000001e-05, "loss": 0.4045, "step": 1450 }, { "epoch": 59.40594059405941, "grad_norm": 1.334035038948059, "learning_rate": 7.495e-05, "loss": 0.4075, "step": 1500 }, { "epoch": 59.40594059405941, "eval_loss": 0.40177610516548157, "eval_runtime": 6.8721, "eval_samples_per_second": 26.193, "eval_steps_per_second": 3.347, "step": 1500 }, { "epoch": 61.386138613861384, "grad_norm": 2.3007051944732666, "learning_rate": 7.745e-05, "loss": 0.4067, "step": 1550 }, { "epoch": 63.366336633663366, "grad_norm": 0.9966986179351807, "learning_rate": 7.995e-05, "loss": 0.4042, "step": 1600 }, { "epoch": 65.34653465346534, "grad_norm": 1.4066482782363892, "learning_rate": 8.245e-05, "loss": 0.4079, "step": 1650 }, { "epoch": 67.32673267326733, "grad_norm": 3.3195865154266357, "learning_rate": 8.495e-05, "loss": 0.4061, "step": 1700 }, { "epoch": 69.3069306930693, "grad_norm": 2.83154559135437, "learning_rate": 8.745000000000001e-05, "loss": 0.4028, "step": 1750 }, { "epoch": 71.2871287128713, "grad_norm": 1.5752816200256348, "learning_rate": 8.995e-05, "loss": 0.3977, "step": 1800 }, { "epoch": 73.26732673267327, "grad_norm": 1.8909986019134521, "learning_rate": 9.245e-05, "loss": 0.4013, "step": 1850 }, { "epoch": 75.24752475247524, "grad_norm": 4.082262992858887, "learning_rate": 9.495e-05, "loss": 0.3991, "step": 1900 }, { "epoch": 77.22772277227723, "grad_norm": 1.8281221389770508, "learning_rate": 9.745000000000001e-05, "loss": 0.4011, "step": 1950 }, { "epoch": 79.20792079207921, "grad_norm": 2.2827675342559814, "learning_rate": 9.995e-05, "loss": 0.3981, "step": 2000 }, { "epoch": 79.20792079207921, "eval_loss": 0.40288153290748596, "eval_runtime": 7.8052, "eval_samples_per_second": 23.062, "eval_steps_per_second": 2.947, "step": 2000 }, { "epoch": 81.18811881188118, "grad_norm": 2.6100072860717773, "learning_rate": 9.918333333333334e-05, "loss": 0.3996, "step": 2050 }, { "epoch": 83.16831683168317, "grad_norm": 1.1003444194793701, "learning_rate": 9.835e-05, "loss": 0.3999, "step": 2100 }, { "epoch": 85.14851485148515, "grad_norm": 1.4783449172973633, "learning_rate": 9.751666666666666e-05, "loss": 0.3951, "step": 2150 }, { "epoch": 87.12871287128714, "grad_norm": 2.3728928565979004, "learning_rate": 9.668333333333334e-05, "loss": 0.3831, "step": 2200 }, { "epoch": 89.10891089108911, "grad_norm": 1.2834324836730957, "learning_rate": 9.585000000000001e-05, "loss": 0.3869, "step": 2250 }, { "epoch": 91.08910891089108, "grad_norm": 1.771146535873413, "learning_rate": 9.501666666666668e-05, "loss": 0.3896, "step": 2300 }, { "epoch": 93.06930693069307, "grad_norm": 2.136204481124878, "learning_rate": 9.418333333333334e-05, "loss": 0.3898, "step": 2350 }, { "epoch": 95.04950495049505, "grad_norm": 0.8848810791969299, "learning_rate": 9.335e-05, "loss": 0.3875, "step": 2400 }, { "epoch": 97.02970297029702, "grad_norm": 1.2002694606781006, "learning_rate": 9.251666666666667e-05, "loss": 0.3808, "step": 2450 }, { "epoch": 99.00990099009901, "grad_norm": 1.392091155052185, "learning_rate": 9.168333333333333e-05, "loss": 0.3862, "step": 2500 }, { "epoch": 99.00990099009901, "eval_loss": 0.39783453941345215, "eval_runtime": 7.729, "eval_samples_per_second": 23.289, "eval_steps_per_second": 2.976, "step": 2500 }, { "epoch": 100.99009900990099, "grad_norm": 1.1166267395019531, "learning_rate": 9.085e-05, "loss": 0.3824, "step": 2550 }, { "epoch": 102.97029702970298, "grad_norm": 1.4629709720611572, "learning_rate": 9.001666666666667e-05, "loss": 0.3829, "step": 2600 }, { "epoch": 104.95049504950495, "grad_norm": 2.9931211471557617, "learning_rate": 8.918333333333334e-05, "loss": 0.3756, "step": 2650 }, { "epoch": 106.93069306930693, "grad_norm": 1.6760491132736206, "learning_rate": 8.834999999999999e-05, "loss": 0.3815, "step": 2700 }, { "epoch": 108.91089108910892, "grad_norm": 1.8942713737487793, "learning_rate": 8.751666666666668e-05, "loss": 0.3773, "step": 2750 }, { "epoch": 110.89108910891089, "grad_norm": 1.110032081604004, "learning_rate": 8.668333333333334e-05, "loss": 0.3747, "step": 2800 }, { "epoch": 112.87128712871286, "grad_norm": 1.3915964365005493, "learning_rate": 8.585000000000001e-05, "loss": 0.3796, "step": 2850 }, { "epoch": 114.85148514851485, "grad_norm": 2.8676748275756836, "learning_rate": 8.501666666666667e-05, "loss": 0.3731, "step": 2900 }, { "epoch": 116.83168316831683, "grad_norm": 1.0008431673049927, "learning_rate": 8.418333333333334e-05, "loss": 0.3747, "step": 2950 }, { "epoch": 118.81188118811882, "grad_norm": 2.071352243423462, "learning_rate": 8.335e-05, "loss": 0.3726, "step": 3000 }, { "epoch": 118.81188118811882, "eval_loss": 0.3978251516819, "eval_runtime": 8.1696, "eval_samples_per_second": 22.033, "eval_steps_per_second": 2.815, "step": 3000 }, { "epoch": 120.79207920792079, "grad_norm": 0.8712412118911743, "learning_rate": 8.251666666666668e-05, "loss": 0.3675, "step": 3050 }, { "epoch": 122.77227722772277, "grad_norm": 4.452208042144775, "learning_rate": 8.168333333333333e-05, "loss": 0.3687, "step": 3100 }, { "epoch": 124.75247524752476, "grad_norm": 2.735180377960205, "learning_rate": 8.085e-05, "loss": 0.3749, "step": 3150 }, { "epoch": 126.73267326732673, "grad_norm": 2.1853744983673096, "learning_rate": 8.001666666666667e-05, "loss": 0.3733, "step": 3200 }, { "epoch": 128.7128712871287, "grad_norm": 3.216191530227661, "learning_rate": 7.918333333333334e-05, "loss": 0.369, "step": 3250 }, { "epoch": 130.69306930693068, "grad_norm": 1.2702809572219849, "learning_rate": 7.835000000000001e-05, "loss": 0.3673, "step": 3300 }, { "epoch": 132.67326732673268, "grad_norm": 2.0314784049987793, "learning_rate": 7.751666666666668e-05, "loss": 0.3671, "step": 3350 }, { "epoch": 134.65346534653466, "grad_norm": 2.0706610679626465, "learning_rate": 7.668333333333335e-05, "loss": 0.3625, "step": 3400 }, { "epoch": 136.63366336633663, "grad_norm": 1.2799315452575684, "learning_rate": 7.585e-05, "loss": 0.3646, "step": 3450 }, { "epoch": 138.6138613861386, "grad_norm": 1.2347270250320435, "learning_rate": 7.501666666666667e-05, "loss": 0.365, "step": 3500 }, { "epoch": 138.6138613861386, "eval_loss": 0.39598318934440613, "eval_runtime": 7.6328, "eval_samples_per_second": 23.582, "eval_steps_per_second": 3.013, "step": 3500 }, { "epoch": 140.59405940594058, "grad_norm": 2.1505396366119385, "learning_rate": 7.418333333333334e-05, "loss": 0.367, "step": 3550 }, { "epoch": 142.5742574257426, "grad_norm": 1.6036536693572998, "learning_rate": 7.335000000000001e-05, "loss": 0.3622, "step": 3600 }, { "epoch": 144.55445544554456, "grad_norm": 1.1357529163360596, "learning_rate": 7.251666666666666e-05, "loss": 0.3589, "step": 3650 }, { "epoch": 146.53465346534654, "grad_norm": 1.5478957891464233, "learning_rate": 7.168333333333333e-05, "loss": 0.3577, "step": 3700 }, { "epoch": 148.5148514851485, "grad_norm": 1.0070338249206543, "learning_rate": 7.085e-05, "loss": 0.3582, "step": 3750 }, { "epoch": 150.4950495049505, "grad_norm": 0.9300253987312317, "learning_rate": 7.001666666666667e-05, "loss": 0.3563, "step": 3800 }, { "epoch": 152.47524752475246, "grad_norm": 0.9197555184364319, "learning_rate": 6.918333333333334e-05, "loss": 0.3514, "step": 3850 }, { "epoch": 154.45544554455446, "grad_norm": 0.6059859991073608, "learning_rate": 6.835000000000001e-05, "loss": 0.3575, "step": 3900 }, { "epoch": 156.43564356435644, "grad_norm": 0.7884564399719238, "learning_rate": 6.751666666666668e-05, "loss": 0.3613, "step": 3950 }, { "epoch": 158.41584158415841, "grad_norm": 0.7471904754638672, "learning_rate": 6.668333333333333e-05, "loss": 0.3525, "step": 4000 }, { "epoch": 158.41584158415841, "eval_loss": 0.39685142040252686, "eval_runtime": 7.1693, "eval_samples_per_second": 25.107, "eval_steps_per_second": 3.208, "step": 4000 }, { "epoch": 160.3960396039604, "grad_norm": 0.9373750686645508, "learning_rate": 6.585e-05, "loss": 0.3537, "step": 4050 }, { "epoch": 162.37623762376236, "grad_norm": 1.3369851112365723, "learning_rate": 6.501666666666667e-05, "loss": 0.3585, "step": 4100 }, { "epoch": 164.35643564356437, "grad_norm": 0.6891220211982727, "learning_rate": 6.418333333333334e-05, "loss": 0.3519, "step": 4150 }, { "epoch": 166.33663366336634, "grad_norm": 0.8272483944892883, "learning_rate": 6.335e-05, "loss": 0.3542, "step": 4200 }, { "epoch": 168.31683168316832, "grad_norm": 0.9853746891021729, "learning_rate": 6.251666666666666e-05, "loss": 0.3553, "step": 4250 }, { "epoch": 170.2970297029703, "grad_norm": 1.0020989179611206, "learning_rate": 6.168333333333333e-05, "loss": 0.3558, "step": 4300 }, { "epoch": 172.27722772277227, "grad_norm": 1.4780181646347046, "learning_rate": 6.085000000000001e-05, "loss": 0.3505, "step": 4350 }, { "epoch": 174.25742574257427, "grad_norm": 0.9966872334480286, "learning_rate": 6.0016666666666664e-05, "loss": 0.3513, "step": 4400 }, { "epoch": 176.23762376237624, "grad_norm": 1.2055169343948364, "learning_rate": 5.918333333333333e-05, "loss": 0.3509, "step": 4450 }, { "epoch": 178.21782178217822, "grad_norm": 1.075426697731018, "learning_rate": 5.835e-05, "loss": 0.3545, "step": 4500 }, { "epoch": 178.21782178217822, "eval_loss": 0.3981594443321228, "eval_runtime": 6.8387, "eval_samples_per_second": 26.321, "eval_steps_per_second": 3.363, "step": 4500 }, { "epoch": 180.1980198019802, "grad_norm": 1.0541815757751465, "learning_rate": 5.751666666666667e-05, "loss": 0.3473, "step": 4550 }, { "epoch": 182.17821782178217, "grad_norm": 2.1192638874053955, "learning_rate": 5.668333333333333e-05, "loss": 0.348, "step": 4600 }, { "epoch": 184.15841584158414, "grad_norm": 1.2069100141525269, "learning_rate": 5.585e-05, "loss": 0.3463, "step": 4650 }, { "epoch": 186.13861386138615, "grad_norm": 0.9461864233016968, "learning_rate": 5.501666666666667e-05, "loss": 0.3471, "step": 4700 }, { "epoch": 188.11881188118812, "grad_norm": 1.0580745935440063, "learning_rate": 5.4183333333333334e-05, "loss": 0.3485, "step": 4750 }, { "epoch": 190.0990099009901, "grad_norm": 0.7629022002220154, "learning_rate": 5.335e-05, "loss": 0.346, "step": 4800 }, { "epoch": 192.07920792079207, "grad_norm": 0.7628908753395081, "learning_rate": 5.251666666666667e-05, "loss": 0.3487, "step": 4850 }, { "epoch": 194.05940594059405, "grad_norm": 1.024609088897705, "learning_rate": 5.168333333333334e-05, "loss": 0.3486, "step": 4900 }, { "epoch": 196.03960396039605, "grad_norm": 0.8158652186393738, "learning_rate": 5.0849999999999996e-05, "loss": 0.3456, "step": 4950 }, { "epoch": 198.01980198019803, "grad_norm": 1.0953030586242676, "learning_rate": 5.0016666666666665e-05, "loss": 0.3473, "step": 5000 }, { "epoch": 198.01980198019803, "eval_loss": 0.40393778681755066, "eval_runtime": 6.9458, "eval_samples_per_second": 25.915, "eval_steps_per_second": 3.311, "step": 5000 }, { "epoch": 200.0, "grad_norm": 1.864687442779541, "learning_rate": 4.9183333333333334e-05, "loss": 0.3484, "step": 5050 }, { "epoch": 201.98019801980197, "grad_norm": 1.406449556350708, "learning_rate": 4.835e-05, "loss": 0.345, "step": 5100 }, { "epoch": 203.96039603960395, "grad_norm": 0.7522682547569275, "learning_rate": 4.751666666666667e-05, "loss": 0.3468, "step": 5150 }, { "epoch": 205.94059405940595, "grad_norm": 0.5859296321868896, "learning_rate": 4.6683333333333334e-05, "loss": 0.3432, "step": 5200 }, { "epoch": 207.92079207920793, "grad_norm": 0.6594001054763794, "learning_rate": 4.585e-05, "loss": 0.3417, "step": 5250 }, { "epoch": 209.9009900990099, "grad_norm": 1.0125696659088135, "learning_rate": 4.5016666666666665e-05, "loss": 0.3428, "step": 5300 }, { "epoch": 211.88118811881188, "grad_norm": 0.8519133925437927, "learning_rate": 4.4183333333333334e-05, "loss": 0.3424, "step": 5350 }, { "epoch": 213.86138613861385, "grad_norm": 0.8138070106506348, "learning_rate": 4.335e-05, "loss": 0.3411, "step": 5400 }, { "epoch": 215.84158415841586, "grad_norm": 1.7046844959259033, "learning_rate": 4.251666666666667e-05, "loss": 0.3418, "step": 5450 }, { "epoch": 217.82178217821783, "grad_norm": 0.8346728682518005, "learning_rate": 4.1683333333333335e-05, "loss": 0.3439, "step": 5500 }, { "epoch": 217.82178217821783, "eval_loss": 0.40201354026794434, "eval_runtime": 7.7869, "eval_samples_per_second": 23.116, "eval_steps_per_second": 2.954, "step": 5500 }, { "epoch": 219.8019801980198, "grad_norm": 0.7159820199012756, "learning_rate": 4.085e-05, "loss": 0.3419, "step": 5550 }, { "epoch": 221.78217821782178, "grad_norm": 1.4013868570327759, "learning_rate": 4.0016666666666666e-05, "loss": 0.3358, "step": 5600 }, { "epoch": 223.76237623762376, "grad_norm": 1.4386184215545654, "learning_rate": 3.9183333333333335e-05, "loss": 0.3457, "step": 5650 }, { "epoch": 225.74257425742573, "grad_norm": 1.1353213787078857, "learning_rate": 3.8350000000000004e-05, "loss": 0.3405, "step": 5700 }, { "epoch": 227.72277227722773, "grad_norm": 1.091909646987915, "learning_rate": 3.7516666666666666e-05, "loss": 0.3403, "step": 5750 }, { "epoch": 229.7029702970297, "grad_norm": 0.8275148272514343, "learning_rate": 3.6683333333333335e-05, "loss": 0.3404, "step": 5800 }, { "epoch": 231.68316831683168, "grad_norm": 0.6606130599975586, "learning_rate": 3.585e-05, "loss": 0.3416, "step": 5850 }, { "epoch": 233.66336633663366, "grad_norm": 1.0569533109664917, "learning_rate": 3.501666666666667e-05, "loss": 0.3404, "step": 5900 }, { "epoch": 235.64356435643563, "grad_norm": 0.8686895370483398, "learning_rate": 3.4183333333333335e-05, "loss": 0.3397, "step": 5950 }, { "epoch": 237.62376237623764, "grad_norm": 0.8039170503616333, "learning_rate": 3.3350000000000004e-05, "loss": 0.3371, "step": 6000 }, { "epoch": 237.62376237623764, "eval_loss": 0.4044432044029236, "eval_runtime": 7.7329, "eval_samples_per_second": 23.277, "eval_steps_per_second": 2.974, "step": 6000 }, { "epoch": 239.6039603960396, "grad_norm": 0.5451411604881287, "learning_rate": 3.2516666666666666e-05, "loss": 0.3394, "step": 6050 }, { "epoch": 241.58415841584159, "grad_norm": 0.6792750954627991, "learning_rate": 3.1683333333333335e-05, "loss": 0.3379, "step": 6100 }, { "epoch": 243.56435643564356, "grad_norm": 0.6445412635803223, "learning_rate": 3.0850000000000004e-05, "loss": 0.3389, "step": 6150 }, { "epoch": 245.54455445544554, "grad_norm": 0.9960897564888, "learning_rate": 3.001666666666667e-05, "loss": 0.3352, "step": 6200 }, { "epoch": 247.52475247524754, "grad_norm": 0.7753505110740662, "learning_rate": 2.9183333333333336e-05, "loss": 0.3375, "step": 6250 }, { "epoch": 249.5049504950495, "grad_norm": 0.5568383932113647, "learning_rate": 2.8349999999999998e-05, "loss": 0.3386, "step": 6300 }, { "epoch": 251.4851485148515, "grad_norm": 0.6036835312843323, "learning_rate": 2.7516666666666667e-05, "loss": 0.3356, "step": 6350 }, { "epoch": 253.46534653465346, "grad_norm": 1.170256495475769, "learning_rate": 2.6683333333333333e-05, "loss": 0.3327, "step": 6400 }, { "epoch": 255.44554455445544, "grad_norm": 0.6887166500091553, "learning_rate": 2.585e-05, "loss": 0.3373, "step": 6450 }, { "epoch": 257.4257425742574, "grad_norm": 0.6323124170303345, "learning_rate": 2.5016666666666667e-05, "loss": 0.3362, "step": 6500 }, { "epoch": 257.4257425742574, "eval_loss": 0.40408840775489807, "eval_runtime": 7.7698, "eval_samples_per_second": 23.167, "eval_steps_per_second": 2.96, "step": 6500 }, { "epoch": 259.4059405940594, "grad_norm": 0.7631197571754456, "learning_rate": 2.4183333333333336e-05, "loss": 0.3325, "step": 6550 }, { "epoch": 261.38613861386136, "grad_norm": 0.6006826162338257, "learning_rate": 2.3350000000000002e-05, "loss": 0.3348, "step": 6600 }, { "epoch": 263.36633663366337, "grad_norm": 0.7407628297805786, "learning_rate": 2.2516666666666667e-05, "loss": 0.3317, "step": 6650 }, { "epoch": 265.34653465346537, "grad_norm": 0.5582762956619263, "learning_rate": 2.1683333333333333e-05, "loss": 0.3334, "step": 6700 }, { "epoch": 267.3267326732673, "grad_norm": 0.4441429674625397, "learning_rate": 2.085e-05, "loss": 0.3308, "step": 6750 }, { "epoch": 269.3069306930693, "grad_norm": 0.6358359456062317, "learning_rate": 2.0016666666666668e-05, "loss": 0.3302, "step": 6800 }, { "epoch": 271.28712871287127, "grad_norm": 0.5992699861526489, "learning_rate": 1.9183333333333333e-05, "loss": 0.3335, "step": 6850 }, { "epoch": 273.26732673267327, "grad_norm": 0.49822068214416504, "learning_rate": 1.8350000000000002e-05, "loss": 0.3325, "step": 6900 }, { "epoch": 275.2475247524753, "grad_norm": 0.6612289547920227, "learning_rate": 1.7516666666666668e-05, "loss": 0.3373, "step": 6950 }, { "epoch": 277.2277227722772, "grad_norm": 0.5066806674003601, "learning_rate": 1.6683333333333333e-05, "loss": 0.3311, "step": 7000 }, { "epoch": 277.2277227722772, "eval_loss": 0.4022347033023834, "eval_runtime": 6.5767, "eval_samples_per_second": 27.369, "eval_steps_per_second": 3.497, "step": 7000 }, { "epoch": 279.2079207920792, "grad_norm": 0.5922915935516357, "learning_rate": 1.5850000000000002e-05, "loss": 0.331, "step": 7050 }, { "epoch": 281.18811881188117, "grad_norm": 0.49854084849357605, "learning_rate": 1.5016666666666668e-05, "loss": 0.3292, "step": 7100 }, { "epoch": 283.16831683168317, "grad_norm": 0.534227192401886, "learning_rate": 1.4183333333333335e-05, "loss": 0.3295, "step": 7150 }, { "epoch": 285.1485148514852, "grad_norm": 0.4879334568977356, "learning_rate": 1.3350000000000001e-05, "loss": 0.3295, "step": 7200 }, { "epoch": 287.1287128712871, "grad_norm": 0.4761298596858978, "learning_rate": 1.2516666666666668e-05, "loss": 0.333, "step": 7250 }, { "epoch": 289.1089108910891, "grad_norm": 0.5835270881652832, "learning_rate": 1.1683333333333334e-05, "loss": 0.3311, "step": 7300 }, { "epoch": 291.08910891089107, "grad_norm": 0.5297247767448425, "learning_rate": 1.0866666666666667e-05, "loss": 0.333, "step": 7350 }, { "epoch": 293.0693069306931, "grad_norm": 0.44668009877204895, "learning_rate": 1.0033333333333333e-05, "loss": 0.3282, "step": 7400 }, { "epoch": 295.0495049504951, "grad_norm": 0.47231703996658325, "learning_rate": 9.2e-06, "loss": 0.3309, "step": 7450 }, { "epoch": 297.029702970297, "grad_norm": 0.5559085011482239, "learning_rate": 8.366666666666667e-06, "loss": 0.3345, "step": 7500 }, { "epoch": 297.029702970297, "eval_loss": 0.40512633323669434, "eval_runtime": 6.9059, "eval_samples_per_second": 26.065, "eval_steps_per_second": 3.33, "step": 7500 }, { "epoch": 299.009900990099, "grad_norm": 0.5674709677696228, "learning_rate": 7.533333333333334e-06, "loss": 0.3317, "step": 7550 }, { "epoch": 300.990099009901, "grad_norm": 0.5428618788719177, "learning_rate": 6.700000000000001e-06, "loss": 0.3322, "step": 7600 }, { "epoch": 302.970297029703, "grad_norm": 0.6271554827690125, "learning_rate": 5.866666666666667e-06, "loss": 0.3337, "step": 7650 }, { "epoch": 304.9504950495049, "grad_norm": 0.41911429166793823, "learning_rate": 5.033333333333334e-06, "loss": 0.329, "step": 7700 }, { "epoch": 306.9306930693069, "grad_norm": 0.4316006600856781, "learning_rate": 4.2000000000000004e-06, "loss": 0.3338, "step": 7750 }, { "epoch": 308.91089108910893, "grad_norm": 0.5471222400665283, "learning_rate": 3.3666666666666665e-06, "loss": 0.3316, "step": 7800 }, { "epoch": 310.8910891089109, "grad_norm": 0.5605342388153076, "learning_rate": 2.5333333333333334e-06, "loss": 0.3289, "step": 7850 }, { "epoch": 312.8712871287129, "grad_norm": 0.5504734516143799, "learning_rate": 1.7000000000000002e-06, "loss": 0.3303, "step": 7900 }, { "epoch": 314.8514851485148, "grad_norm": 0.5514795780181885, "learning_rate": 8.666666666666667e-07, "loss": 0.3282, "step": 7950 }, { "epoch": 316.83168316831683, "grad_norm": 0.5700021982192993, "learning_rate": 3.3333333333333334e-08, "loss": 0.3348, "step": 8000 }, { "epoch": 316.83168316831683, "eval_loss": 0.4050144553184509, "eval_runtime": 6.8387, "eval_samples_per_second": 26.321, "eval_steps_per_second": 3.363, "step": 8000 } ], "logging_steps": 50, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 320, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.643923525044128e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }