|
{
  "best_metric": 0.39249518513679504,
  "best_model_checkpoint": "mikhail-panzo/ceb_b64_le5_s8000/checkpoint-6000",
  "epoch": 313.72549019607845,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.9607843137254903,
      "grad_norm": 2.2592813968658447,
      "learning_rate": 2.4500000000000004e-07,
      "loss": 0.8071,
      "step": 50
    },
    {
      "epoch": 3.9215686274509802,
      "grad_norm": 1.564003348350525,
      "learning_rate": 4.95e-07,
      "loss": 0.7677,
      "step": 100
    },
    {
      "epoch": 5.882352941176471,
      "grad_norm": 1.3204560279846191,
      "learning_rate": 7.450000000000001e-07,
      "loss": 0.7469,
      "step": 150
    },
    {
      "epoch": 7.8431372549019605,
      "grad_norm": 1.3252646923065186,
      "learning_rate": 9.950000000000002e-07,
      "loss": 0.7234,
      "step": 200
    },
    {
      "epoch": 9.803921568627452,
      "grad_norm": 1.9049391746520996,
      "learning_rate": 1.2450000000000002e-06,
      "loss": 0.7167,
      "step": 250
    },
    {
      "epoch": 11.764705882352942,
      "grad_norm": 1.5868775844573975,
      "learning_rate": 1.495e-06,
      "loss": 0.6949,
      "step": 300
    },
    {
      "epoch": 13.72549019607843,
      "grad_norm": 2.022406578063965,
      "learning_rate": 1.745e-06,
      "loss": 0.6581,
      "step": 350
    },
    {
      "epoch": 15.686274509803921,
      "grad_norm": 1.601341724395752,
      "learning_rate": 1.9950000000000004e-06,
      "loss": 0.6181,
      "step": 400
    },
    {
      "epoch": 17.647058823529413,
      "grad_norm": 1.671892523765564,
      "learning_rate": 2.245e-06,
      "loss": 0.5798,
      "step": 450
    },
    {
      "epoch": 19.607843137254903,
      "grad_norm": 1.9054476022720337,
      "learning_rate": 2.4950000000000003e-06,
      "loss": 0.5525,
      "step": 500
    },
    {
      "epoch": 19.607843137254903,
      "eval_loss": 0.4685615003108978,
      "eval_runtime": 6.3848,
      "eval_samples_per_second": 28.192,
      "eval_steps_per_second": 3.602,
      "step": 500
    },
    {
      "epoch": 21.568627450980394,
      "grad_norm": 1.5475994348526,
      "learning_rate": 2.7450000000000004e-06,
      "loss": 0.5407,
      "step": 550
    },
    {
      "epoch": 23.529411764705884,
      "grad_norm": 1.1005382537841797,
      "learning_rate": 2.995e-06,
      "loss": 0.5076,
      "step": 600
    },
    {
      "epoch": 25.49019607843137,
      "grad_norm": 0.979748547077179,
      "learning_rate": 3.2450000000000003e-06,
      "loss": 0.5071,
      "step": 650
    },
    {
      "epoch": 27.45098039215686,
      "grad_norm": 1.0884859561920166,
      "learning_rate": 3.495e-06,
      "loss": 0.4934,
      "step": 700
    },
    {
      "epoch": 29.41176470588235,
      "grad_norm": 1.8463184833526611,
      "learning_rate": 3.745e-06,
      "loss": 0.4919,
      "step": 750
    },
    {
      "epoch": 31.372549019607842,
      "grad_norm": 1.2262991666793823,
      "learning_rate": 3.995000000000001e-06,
      "loss": 0.4981,
      "step": 800
    },
    {
      "epoch": 33.333333333333336,
      "grad_norm": 1.0878300666809082,
      "learning_rate": 4.245e-06,
      "loss": 0.4928,
      "step": 850
    },
    {
      "epoch": 35.294117647058826,
      "grad_norm": 1.155561089515686,
      "learning_rate": 4.495e-06,
      "loss": 0.4848,
      "step": 900
    },
    {
      "epoch": 37.254901960784316,
      "grad_norm": 1.4933425188064575,
      "learning_rate": 4.745e-06,
      "loss": 0.4754,
      "step": 950
    },
    {
      "epoch": 39.21568627450981,
      "grad_norm": 1.239863634109497,
      "learning_rate": 4.9950000000000005e-06,
      "loss": 0.4756,
      "step": 1000
    },
    {
      "epoch": 39.21568627450981,
      "eval_loss": 0.42764008045196533,
      "eval_runtime": 6.3802,
      "eval_samples_per_second": 28.212,
      "eval_steps_per_second": 3.605,
      "step": 1000
    },
    {
      "epoch": 41.1764705882353,
      "grad_norm": 1.9258134365081787,
      "learning_rate": 5.245e-06,
      "loss": 0.4653,
      "step": 1050
    },
    {
      "epoch": 43.13725490196079,
      "grad_norm": 0.9576085805892944,
      "learning_rate": 5.495000000000001e-06,
      "loss": 0.465,
      "step": 1100
    },
    {
      "epoch": 45.09803921568628,
      "grad_norm": 0.9215015769004822,
      "learning_rate": 5.745000000000001e-06,
      "loss": 0.4743,
      "step": 1150
    },
    {
      "epoch": 47.05882352941177,
      "grad_norm": 1.1917026042938232,
      "learning_rate": 5.995000000000001e-06,
      "loss": 0.4581,
      "step": 1200
    },
    {
      "epoch": 49.01960784313726,
      "grad_norm": 1.30499267578125,
      "learning_rate": 6.245000000000001e-06,
      "loss": 0.4626,
      "step": 1250
    },
    {
      "epoch": 50.98039215686274,
      "grad_norm": 1.4004439115524292,
      "learning_rate": 6.4950000000000005e-06,
      "loss": 0.4591,
      "step": 1300
    },
    {
      "epoch": 52.94117647058823,
      "grad_norm": 0.892646074295044,
      "learning_rate": 6.745000000000001e-06,
      "loss": 0.4545,
      "step": 1350
    },
    {
      "epoch": 54.90196078431372,
      "grad_norm": 1.0985666513442993,
      "learning_rate": 6.995000000000001e-06,
      "loss": 0.4501,
      "step": 1400
    },
    {
      "epoch": 56.86274509803921,
      "grad_norm": 1.271141529083252,
      "learning_rate": 7.245000000000001e-06,
      "loss": 0.4456,
      "step": 1450
    },
    {
      "epoch": 58.8235294117647,
      "grad_norm": 1.170074224472046,
      "learning_rate": 7.495000000000001e-06,
      "loss": 0.4543,
      "step": 1500
    },
    {
      "epoch": 58.8235294117647,
      "eval_loss": 0.4116212725639343,
      "eval_runtime": 6.2794,
      "eval_samples_per_second": 28.665,
      "eval_steps_per_second": 3.663,
      "step": 1500
    },
    {
      "epoch": 60.78431372549019,
      "grad_norm": 0.9767690896987915,
      "learning_rate": 7.745e-06,
      "loss": 0.4517,
      "step": 1550
    },
    {
      "epoch": 62.745098039215684,
      "grad_norm": 1.4193260669708252,
      "learning_rate": 7.995e-06,
      "loss": 0.4537,
      "step": 1600
    },
    {
      "epoch": 64.70588235294117,
      "grad_norm": 1.3294360637664795,
      "learning_rate": 8.245000000000002e-06,
      "loss": 0.4435,
      "step": 1650
    },
    {
      "epoch": 66.66666666666667,
      "grad_norm": 0.8386899828910828,
      "learning_rate": 8.495e-06,
      "loss": 0.4507,
      "step": 1700
    },
    {
      "epoch": 68.62745098039215,
      "grad_norm": 1.0917119979858398,
      "learning_rate": 8.745000000000002e-06,
      "loss": 0.4409,
      "step": 1750
    },
    {
      "epoch": 70.58823529411765,
      "grad_norm": 1.0725489854812622,
      "learning_rate": 8.995000000000001e-06,
      "loss": 0.4449,
      "step": 1800
    },
    {
      "epoch": 72.54901960784314,
      "grad_norm": 1.3506999015808105,
      "learning_rate": 9.245e-06,
      "loss": 0.4496,
      "step": 1850
    },
    {
      "epoch": 74.50980392156863,
      "grad_norm": 0.9701379537582397,
      "learning_rate": 9.495000000000001e-06,
      "loss": 0.4384,
      "step": 1900
    },
    {
      "epoch": 76.47058823529412,
      "grad_norm": 1.7079219818115234,
      "learning_rate": 9.745e-06,
      "loss": 0.4374,
      "step": 1950
    },
    {
      "epoch": 78.43137254901961,
      "grad_norm": 1.87998628616333,
      "learning_rate": 9.995000000000002e-06,
      "loss": 0.4346,
      "step": 2000
    },
    {
      "epoch": 78.43137254901961,
      "eval_loss": 0.4027920663356781,
      "eval_runtime": 6.3546,
      "eval_samples_per_second": 28.326,
      "eval_steps_per_second": 3.619,
      "step": 2000
    },
    {
      "epoch": 80.3921568627451,
      "grad_norm": 1.1510419845581055,
      "learning_rate": 9.918333333333335e-06,
      "loss": 0.4326,
      "step": 2050
    },
    {
      "epoch": 82.3529411764706,
      "grad_norm": 1.2605654001235962,
      "learning_rate": 9.835000000000002e-06,
      "loss": 0.4355,
      "step": 2100
    },
    {
      "epoch": 84.31372549019608,
      "grad_norm": 0.866606593132019,
      "learning_rate": 9.751666666666667e-06,
      "loss": 0.4286,
      "step": 2150
    },
    {
      "epoch": 86.27450980392157,
      "grad_norm": 2.0733227729797363,
      "learning_rate": 9.668333333333334e-06,
      "loss": 0.4365,
      "step": 2200
    },
    {
      "epoch": 88.23529411764706,
      "grad_norm": 0.9726402759552002,
      "learning_rate": 9.585e-06,
      "loss": 0.4367,
      "step": 2250
    },
    {
      "epoch": 90.19607843137256,
      "grad_norm": 1.0713222026824951,
      "learning_rate": 9.501666666666667e-06,
      "loss": 0.4288,
      "step": 2300
    },
    {
      "epoch": 92.15686274509804,
      "grad_norm": 1.5218483209609985,
      "learning_rate": 9.418333333333334e-06,
      "loss": 0.435,
      "step": 2350
    },
    {
      "epoch": 94.11764705882354,
      "grad_norm": 0.8391968011856079,
      "learning_rate": 9.335000000000001e-06,
      "loss": 0.431,
      "step": 2400
    },
    {
      "epoch": 96.07843137254902,
      "grad_norm": 1.3989890813827515,
      "learning_rate": 9.251666666666668e-06,
      "loss": 0.4251,
      "step": 2450
    },
    {
      "epoch": 98.03921568627452,
      "grad_norm": 0.9168123006820679,
      "learning_rate": 9.168333333333333e-06,
      "loss": 0.4292,
      "step": 2500
    },
    {
      "epoch": 98.03921568627452,
      "eval_loss": 0.3997121751308441,
      "eval_runtime": 6.4036,
      "eval_samples_per_second": 28.109,
      "eval_steps_per_second": 3.592,
      "step": 2500
    },
    {
      "epoch": 100.0,
      "grad_norm": 2.315229892730713,
      "learning_rate": 9.085e-06,
      "loss": 0.4295,
      "step": 2550
    },
    {
      "epoch": 101.96078431372548,
      "grad_norm": 1.0224589109420776,
      "learning_rate": 9.001666666666667e-06,
      "loss": 0.4212,
      "step": 2600
    },
    {
      "epoch": 103.92156862745098,
      "grad_norm": 1.389236330986023,
      "learning_rate": 8.918333333333334e-06,
      "loss": 0.4215,
      "step": 2650
    },
    {
      "epoch": 105.88235294117646,
      "grad_norm": 0.9601902961730957,
      "learning_rate": 8.836666666666668e-06,
      "loss": 0.4271,
      "step": 2700
    },
    {
      "epoch": 107.84313725490196,
      "grad_norm": 0.8070810437202454,
      "learning_rate": 8.753333333333333e-06,
      "loss": 0.4269,
      "step": 2750
    },
    {
      "epoch": 109.80392156862744,
      "grad_norm": 1.3389467000961304,
      "learning_rate": 8.67e-06,
      "loss": 0.4209,
      "step": 2800
    },
    {
      "epoch": 111.76470588235294,
      "grad_norm": 1.0083783864974976,
      "learning_rate": 8.586666666666667e-06,
      "loss": 0.4143,
      "step": 2850
    },
    {
      "epoch": 113.72549019607843,
      "grad_norm": 1.390769600868225,
      "learning_rate": 8.503333333333334e-06,
      "loss": 0.4208,
      "step": 2900
    },
    {
      "epoch": 115.68627450980392,
      "grad_norm": 1.0967605113983154,
      "learning_rate": 8.42e-06,
      "loss": 0.4127,
      "step": 2950
    },
    {
      "epoch": 117.6470588235294,
      "grad_norm": 0.9284217357635498,
      "learning_rate": 8.336666666666668e-06,
      "loss": 0.4166,
      "step": 3000
    },
    {
      "epoch": 117.6470588235294,
      "eval_loss": 0.395207017660141,
      "eval_runtime": 6.3246,
      "eval_samples_per_second": 28.46,
      "eval_steps_per_second": 3.637,
      "step": 3000
    },
    {
      "epoch": 119.6078431372549,
      "grad_norm": 1.1663891077041626,
      "learning_rate": 8.253333333333334e-06,
      "loss": 0.4155,
      "step": 3050
    },
    {
      "epoch": 121.56862745098039,
      "grad_norm": 5.156078338623047,
      "learning_rate": 8.17e-06,
      "loss": 0.4156,
      "step": 3100
    },
    {
      "epoch": 123.52941176470588,
      "grad_norm": 0.9861869215965271,
      "learning_rate": 8.086666666666667e-06,
      "loss": 0.4163,
      "step": 3150
    },
    {
      "epoch": 125.49019607843137,
      "grad_norm": 0.8449072241783142,
      "learning_rate": 8.003333333333334e-06,
      "loss": 0.4176,
      "step": 3200
    },
    {
      "epoch": 127.45098039215686,
      "grad_norm": 1.300947904586792,
      "learning_rate": 7.92e-06,
      "loss": 0.4245,
      "step": 3250
    },
    {
      "epoch": 129.41176470588235,
      "grad_norm": 0.7830358147621155,
      "learning_rate": 7.836666666666667e-06,
      "loss": 0.4124,
      "step": 3300
    },
    {
      "epoch": 131.37254901960785,
      "grad_norm": 1.3250160217285156,
      "learning_rate": 7.753333333333334e-06,
      "loss": 0.4119,
      "step": 3350
    },
    {
      "epoch": 133.33333333333334,
      "grad_norm": 0.8610659241676331,
      "learning_rate": 7.670000000000001e-06,
      "loss": 0.4154,
      "step": 3400
    },
    {
      "epoch": 135.2941176470588,
      "grad_norm": 1.3154343366622925,
      "learning_rate": 7.586666666666668e-06,
      "loss": 0.4141,
      "step": 3450
    },
    {
      "epoch": 137.2549019607843,
      "grad_norm": 0.9761775732040405,
      "learning_rate": 7.503333333333333e-06,
      "loss": 0.4122,
      "step": 3500
    },
    {
      "epoch": 137.2549019607843,
      "eval_loss": 0.3956769108772278,
      "eval_runtime": 6.4498,
      "eval_samples_per_second": 27.908,
      "eval_steps_per_second": 3.566,
      "step": 3500
    },
    {
      "epoch": 139.2156862745098,
      "grad_norm": 1.142308235168457,
      "learning_rate": 7.420000000000001e-06,
      "loss": 0.4107,
      "step": 3550
    },
    {
      "epoch": 141.1764705882353,
      "grad_norm": 0.863447368144989,
      "learning_rate": 7.336666666666668e-06,
      "loss": 0.4093,
      "step": 3600
    },
    {
      "epoch": 143.13725490196077,
      "grad_norm": 0.9665449261665344,
      "learning_rate": 7.253333333333335e-06,
      "loss": 0.4114,
      "step": 3650
    },
    {
      "epoch": 145.09803921568627,
      "grad_norm": 1.046099305152893,
      "learning_rate": 7.17e-06,
      "loss": 0.4131,
      "step": 3700
    },
    {
      "epoch": 147.05882352941177,
      "grad_norm": 0.9744811654090881,
      "learning_rate": 7.086666666666667e-06,
      "loss": 0.4068,
      "step": 3750
    },
    {
      "epoch": 149.01960784313727,
      "grad_norm": 1.3594048023223877,
      "learning_rate": 7.003333333333335e-06,
      "loss": 0.4086,
      "step": 3800
    },
    {
      "epoch": 150.98039215686273,
      "grad_norm": 0.889851450920105,
      "learning_rate": 6.92e-06,
      "loss": 0.4098,
      "step": 3850
    },
    {
      "epoch": 152.94117647058823,
      "grad_norm": 0.940967321395874,
      "learning_rate": 6.836666666666667e-06,
      "loss": 0.4071,
      "step": 3900
    },
    {
      "epoch": 154.90196078431373,
      "grad_norm": 1.5695271492004395,
      "learning_rate": 6.753333333333334e-06,
      "loss": 0.4097,
      "step": 3950
    },
    {
      "epoch": 156.86274509803923,
      "grad_norm": 1.224783182144165,
      "learning_rate": 6.6700000000000005e-06,
      "loss": 0.4063,
      "step": 4000
    },
    {
      "epoch": 156.86274509803923,
      "eval_loss": 0.3939879834651947,
      "eval_runtime": 6.6684,
      "eval_samples_per_second": 26.993,
      "eval_steps_per_second": 3.449,
      "step": 4000
    },
    {
      "epoch": 158.8235294117647,
      "grad_norm": 0.8880926370620728,
      "learning_rate": 6.5866666666666666e-06,
      "loss": 0.4149,
      "step": 4050
    },
    {
      "epoch": 160.7843137254902,
      "grad_norm": 0.8387724757194519,
      "learning_rate": 6.5033333333333335e-06,
      "loss": 0.4066,
      "step": 4100
    },
    {
      "epoch": 162.7450980392157,
      "grad_norm": 0.9014448523521423,
      "learning_rate": 6.42e-06,
      "loss": 0.4102,
      "step": 4150
    },
    {
      "epoch": 164.7058823529412,
      "grad_norm": 0.7693892121315002,
      "learning_rate": 6.336666666666667e-06,
      "loss": 0.4066,
      "step": 4200
    },
    {
      "epoch": 166.66666666666666,
      "grad_norm": 0.9693483114242554,
      "learning_rate": 6.253333333333333e-06,
      "loss": 0.4084,
      "step": 4250
    },
    {
      "epoch": 168.62745098039215,
      "grad_norm": 0.8914594054222107,
      "learning_rate": 6.17e-06,
      "loss": 0.4109,
      "step": 4300
    },
    {
      "epoch": 170.58823529411765,
      "grad_norm": 11.76923942565918,
      "learning_rate": 6.086666666666667e-06,
      "loss": 0.4109,
      "step": 4350
    },
    {
      "epoch": 172.54901960784315,
      "grad_norm": 0.8837347626686096,
      "learning_rate": 6.003333333333334e-06,
      "loss": 0.4051,
      "step": 4400
    },
    {
      "epoch": 174.50980392156862,
      "grad_norm": 0.8585467338562012,
      "learning_rate": 5.92e-06,
      "loss": 0.4017,
      "step": 4450
    },
    {
      "epoch": 176.47058823529412,
      "grad_norm": 0.967064380645752,
      "learning_rate": 5.836666666666667e-06,
      "loss": 0.4028,
      "step": 4500
    },
    {
      "epoch": 176.47058823529412,
      "eval_loss": 0.395079106092453,
      "eval_runtime": 6.3984,
      "eval_samples_per_second": 28.132,
      "eval_steps_per_second": 3.595,
      "step": 4500
    },
    {
      "epoch": 178.4313725490196,
      "grad_norm": 1.2672348022460938,
      "learning_rate": 5.753333333333334e-06,
      "loss": 0.4068,
      "step": 4550
    },
    {
      "epoch": 180.3921568627451,
      "grad_norm": 0.8008630275726318,
      "learning_rate": 5.67e-06,
      "loss": 0.4112,
      "step": 4600
    },
    {
      "epoch": 182.35294117647058,
      "grad_norm": 1.0430930852890015,
      "learning_rate": 5.586666666666667e-06,
      "loss": 0.4145,
      "step": 4650
    },
    {
      "epoch": 184.31372549019608,
      "grad_norm": 0.767160952091217,
      "learning_rate": 5.503333333333334e-06,
      "loss": 0.3992,
      "step": 4700
    },
    {
      "epoch": 186.27450980392157,
      "grad_norm": 1.0731803178787231,
      "learning_rate": 5.420000000000001e-06,
      "loss": 0.4025,
      "step": 4750
    },
    {
      "epoch": 188.23529411764707,
      "grad_norm": 0.7984374761581421,
      "learning_rate": 5.336666666666667e-06,
      "loss": 0.4002,
      "step": 4800
    },
    {
      "epoch": 190.19607843137254,
      "grad_norm": 0.8159019947052002,
      "learning_rate": 5.2533333333333336e-06,
      "loss": 0.4023,
      "step": 4850
    },
    {
      "epoch": 192.15686274509804,
      "grad_norm": 1.1482937335968018,
      "learning_rate": 5.1700000000000005e-06,
      "loss": 0.4021,
      "step": 4900
    },
    {
      "epoch": 194.11764705882354,
      "grad_norm": 0.9159001708030701,
      "learning_rate": 5.086666666666667e-06,
      "loss": 0.4058,
      "step": 4950
    },
    {
      "epoch": 196.07843137254903,
      "grad_norm": 0.8861828446388245,
      "learning_rate": 5.0033333333333334e-06,
      "loss": 0.3982,
      "step": 5000
    },
    {
      "epoch": 196.07843137254903,
      "eval_loss": 0.3931240439414978,
      "eval_runtime": 6.4434,
      "eval_samples_per_second": 27.936,
      "eval_steps_per_second": 3.57,
      "step": 5000
    },
    {
      "epoch": 198.0392156862745,
      "grad_norm": 0.9328787922859192,
      "learning_rate": 4.92e-06,
      "loss": 0.405,
      "step": 5050
    },
    {
      "epoch": 200.0,
      "grad_norm": 1.2074756622314453,
      "learning_rate": 4.836666666666667e-06,
      "loss": 0.3996,
      "step": 5100
    },
    {
      "epoch": 201.9607843137255,
      "grad_norm": 0.7930981516838074,
      "learning_rate": 4.753333333333333e-06,
      "loss": 0.3994,
      "step": 5150
    },
    {
      "epoch": 203.92156862745097,
      "grad_norm": 0.8685852885246277,
      "learning_rate": 4.670000000000001e-06,
      "loss": 0.3983,
      "step": 5200
    },
    {
      "epoch": 205.88235294117646,
      "grad_norm": 0.8023931980133057,
      "learning_rate": 4.586666666666667e-06,
      "loss": 0.4041,
      "step": 5250
    },
    {
      "epoch": 207.84313725490196,
      "grad_norm": 0.7825555801391602,
      "learning_rate": 4.503333333333333e-06,
      "loss": 0.4012,
      "step": 5300
    },
    {
      "epoch": 209.80392156862746,
      "grad_norm": 1.1338242292404175,
      "learning_rate": 4.42e-06,
      "loss": 0.3996,
      "step": 5350
    },
    {
      "epoch": 211.76470588235293,
      "grad_norm": 1.3594533205032349,
      "learning_rate": 4.336666666666667e-06,
      "loss": 0.4022,
      "step": 5400
    },
    {
      "epoch": 213.72549019607843,
      "grad_norm": 1.0148471593856812,
      "learning_rate": 4.253333333333334e-06,
      "loss": 0.4003,
      "step": 5450
    },
    {
      "epoch": 215.68627450980392,
      "grad_norm": 0.932323157787323,
      "learning_rate": 4.17e-06,
      "loss": 0.4055,
      "step": 5500
    },
    {
      "epoch": 215.68627450980392,
      "eval_loss": 0.3945559561252594,
      "eval_runtime": 6.6419,
      "eval_samples_per_second": 27.101,
      "eval_steps_per_second": 3.463,
      "step": 5500
    },
    {
      "epoch": 217.64705882352942,
      "grad_norm": 0.7886359095573425,
      "learning_rate": 4.086666666666667e-06,
      "loss": 0.3938,
      "step": 5550
    },
    {
      "epoch": 219.6078431372549,
      "grad_norm": 1.1809077262878418,
      "learning_rate": 4.003333333333334e-06,
      "loss": 0.4033,
      "step": 5600
    },
    {
      "epoch": 221.5686274509804,
      "grad_norm": 0.8167886137962341,
      "learning_rate": 3.920000000000001e-06,
      "loss": 0.4002,
      "step": 5650
    },
    {
      "epoch": 223.52941176470588,
      "grad_norm": 1.1182901859283447,
      "learning_rate": 3.836666666666667e-06,
      "loss": 0.3982,
      "step": 5700
    },
    {
      "epoch": 225.49019607843138,
      "grad_norm": 1.0442403554916382,
      "learning_rate": 3.753333333333334e-06,
      "loss": 0.3973,
      "step": 5750
    },
    {
      "epoch": 227.45098039215685,
      "grad_norm": 1.0428043603897095,
      "learning_rate": 3.6700000000000004e-06,
      "loss": 0.4001,
      "step": 5800
    },
    {
      "epoch": 229.41176470588235,
      "grad_norm": 0.9441725611686707,
      "learning_rate": 3.5866666666666673e-06,
      "loss": 0.3976,
      "step": 5850
    },
    {
      "epoch": 231.37254901960785,
      "grad_norm": 0.8371317386627197,
      "learning_rate": 3.5033333333333334e-06,
      "loss": 0.3973,
      "step": 5900
    },
    {
      "epoch": 233.33333333333334,
      "grad_norm": 0.7603981494903564,
      "learning_rate": 3.4200000000000007e-06,
      "loss": 0.3943,
      "step": 5950
    },
    {
      "epoch": 235.2941176470588,
      "grad_norm": 1.1027510166168213,
      "learning_rate": 3.3366666666666668e-06,
      "loss": 0.4019,
      "step": 6000
    },
    {
      "epoch": 235.2941176470588,
      "eval_loss": 0.39249518513679504,
      "eval_runtime": 6.4221,
      "eval_samples_per_second": 28.028,
      "eval_steps_per_second": 3.581,
      "step": 6000
    },
    {
      "epoch": 237.2549019607843,
      "grad_norm": 1.0101209878921509,
      "learning_rate": 3.2533333333333332e-06,
      "loss": 0.396,
      "step": 6050
    },
    {
      "epoch": 239.2156862745098,
      "grad_norm": 1.2180397510528564,
      "learning_rate": 3.17e-06,
      "loss": 0.3986,
      "step": 6100
    },
    {
      "epoch": 241.1764705882353,
      "grad_norm": 0.9254348874092102,
      "learning_rate": 3.0866666666666666e-06,
      "loss": 0.397,
      "step": 6150
    },
    {
      "epoch": 243.13725490196077,
      "grad_norm": 1.0118534564971924,
      "learning_rate": 3.0033333333333335e-06,
      "loss": 0.3974,
      "step": 6200
    },
    {
      "epoch": 245.09803921568627,
      "grad_norm": 0.8282018899917603,
      "learning_rate": 2.92e-06,
      "loss": 0.3909,
      "step": 6250
    },
    {
      "epoch": 247.05882352941177,
      "grad_norm": 0.8930662870407104,
      "learning_rate": 2.836666666666667e-06,
      "loss": 0.3986,
      "step": 6300
    },
    {
      "epoch": 249.01960784313727,
      "grad_norm": 1.106566071510315,
      "learning_rate": 2.7533333333333334e-06,
      "loss": 0.3984,
      "step": 6350
    },
    {
      "epoch": 250.98039215686273,
      "grad_norm": 0.9557139873504639,
      "learning_rate": 2.6700000000000003e-06,
      "loss": 0.3975,
      "step": 6400
    },
    {
      "epoch": 252.94117647058823,
      "grad_norm": 1.1916723251342773,
      "learning_rate": 2.5866666666666667e-06,
      "loss": 0.402,
      "step": 6450
    },
    {
      "epoch": 254.90196078431373,
      "grad_norm": 0.8120290637016296,
      "learning_rate": 2.5033333333333336e-06,
      "loss": 0.4,
      "step": 6500
    },
    {
      "epoch": 254.90196078431373,
      "eval_loss": 0.39403682947158813,
      "eval_runtime": 6.3124,
      "eval_samples_per_second": 28.515,
      "eval_steps_per_second": 3.644,
      "step": 6500
    },
    {
      "epoch": 256.8627450980392,
      "grad_norm": 0.9240416884422302,
      "learning_rate": 2.42e-06,
      "loss": 0.391,
      "step": 6550
    },
    {
      "epoch": 258.8235294117647,
      "grad_norm": 1.0390872955322266,
      "learning_rate": 2.3366666666666666e-06,
      "loss": 0.3985,
      "step": 6600
    },
    {
      "epoch": 260.7843137254902,
      "grad_norm": 0.7079731822013855,
      "learning_rate": 2.2550000000000004e-06,
      "loss": 0.3947,
      "step": 6650
    },
    {
      "epoch": 262.7450980392157,
      "grad_norm": 1.0071649551391602,
      "learning_rate": 2.171666666666667e-06,
      "loss": 0.4006,
      "step": 6700
    },
    {
      "epoch": 264.70588235294116,
      "grad_norm": 0.9264857769012451,
      "learning_rate": 2.088333333333334e-06,
      "loss": 0.3983,
      "step": 6750
    },
    {
      "epoch": 266.6666666666667,
      "grad_norm": 0.7591241002082825,
      "learning_rate": 2.0050000000000003e-06,
      "loss": 0.3995,
      "step": 6800
    },
    {
      "epoch": 268.62745098039215,
      "grad_norm": 0.9408916234970093,
      "learning_rate": 1.9216666666666668e-06,
      "loss": 0.4013,
      "step": 6850
    },
    {
      "epoch": 270.5882352941176,
      "grad_norm": 0.8653966188430786,
      "learning_rate": 1.8383333333333334e-06,
      "loss": 0.3948,
      "step": 6900
    },
    {
      "epoch": 272.54901960784315,
      "grad_norm": 1.010793924331665,
      "learning_rate": 1.7550000000000001e-06,
      "loss": 0.3935,
      "step": 6950
    },
    {
      "epoch": 274.5098039215686,
      "grad_norm": 1.1880955696105957,
      "learning_rate": 1.6716666666666666e-06,
      "loss": 0.4046,
      "step": 7000
    },
    {
      "epoch": 274.5098039215686,
      "eval_loss": 0.3952997326850891,
      "eval_runtime": 6.3393,
      "eval_samples_per_second": 28.394,
      "eval_steps_per_second": 3.628,
      "step": 7000
    },
    {
      "epoch": 276.47058823529414,
      "grad_norm": 0.9707151651382446,
      "learning_rate": 1.5883333333333333e-06,
      "loss": 0.4001,
      "step": 7050
    },
    {
      "epoch": 278.4313725490196,
      "grad_norm": 0.7843708395957947,
      "learning_rate": 1.505e-06,
      "loss": 0.3942,
      "step": 7100
    },
    {
      "epoch": 280.3921568627451,
      "grad_norm": 0.900497555732727,
      "learning_rate": 1.4216666666666667e-06,
      "loss": 0.3981,
      "step": 7150
    },
    {
      "epoch": 282.3529411764706,
      "grad_norm": 0.7594972848892212,
      "learning_rate": 1.3383333333333334e-06,
      "loss": 0.3961,
      "step": 7200
    },
    {
      "epoch": 284.3137254901961,
      "grad_norm": 0.7281601428985596,
      "learning_rate": 1.255e-06,
      "loss": 0.394,
      "step": 7250
    },
    {
      "epoch": 286.27450980392155,
      "grad_norm": 0.7863117456436157,
      "learning_rate": 1.1716666666666667e-06,
      "loss": 0.3899,
      "step": 7300
    },
    {
      "epoch": 288.2352941176471,
      "grad_norm": 0.7224944233894348,
      "learning_rate": 1.0883333333333334e-06,
      "loss": 0.3993,
      "step": 7350
    },
    {
      "epoch": 290.19607843137254,
      "grad_norm": 0.7235053181648254,
      "learning_rate": 1.0050000000000001e-06,
      "loss": 0.3928,
      "step": 7400
    },
    {
      "epoch": 292.15686274509807,
      "grad_norm": 0.7260425686836243,
      "learning_rate": 9.216666666666667e-07,
      "loss": 0.3904,
      "step": 7450
    },
    {
      "epoch": 294.11764705882354,
      "grad_norm": 0.7799311876296997,
      "learning_rate": 8.383333333333334e-07,
      "loss": 0.3955,
      "step": 7500
    },
    {
      "epoch": 294.11764705882354,
      "eval_loss": 0.3945452570915222,
      "eval_runtime": 6.6937,
      "eval_samples_per_second": 26.891,
      "eval_steps_per_second": 3.436,
      "step": 7500
    },
    {
      "epoch": 296.078431372549,
      "grad_norm": 0.7973820567131042,
      "learning_rate": 7.550000000000001e-07,
      "loss": 0.3954,
      "step": 7550
    },
    {
      "epoch": 298.03921568627453,
      "grad_norm": 0.7256256341934204,
      "learning_rate": 6.716666666666668e-07,
      "loss": 0.3934,
      "step": 7600
    },
    {
      "epoch": 300.0,
      "grad_norm": 1.1149753332138062,
      "learning_rate": 5.883333333333333e-07,
      "loss": 0.3913,
      "step": 7650
    },
    {
      "epoch": 301.96078431372547,
      "grad_norm": 0.6526250243186951,
      "learning_rate": 5.05e-07,
      "loss": 0.391,
      "step": 7700
    },
    {
      "epoch": 303.921568627451,
      "grad_norm": 0.8052578568458557,
      "learning_rate": 4.2166666666666667e-07,
      "loss": 0.3889,
      "step": 7750
    },
    {
      "epoch": 305.88235294117646,
      "grad_norm": 0.7933726906776428,
      "learning_rate": 3.3833333333333336e-07,
      "loss": 0.3918,
      "step": 7800
    },
    {
      "epoch": 307.84313725490193,
      "grad_norm": 0.7680366635322571,
      "learning_rate": 2.55e-07,
      "loss": 0.3948,
      "step": 7850
    },
    {
      "epoch": 309.80392156862746,
      "grad_norm": 0.8762979507446289,
      "learning_rate": 1.7166666666666668e-07,
      "loss": 0.3937,
      "step": 7900
    },
    {
      "epoch": 311.7647058823529,
      "grad_norm": 0.7918867468833923,
      "learning_rate": 8.833333333333334e-08,
      "loss": 0.3889,
      "step": 7950
    },
    {
      "epoch": 313.72549019607845,
      "grad_norm": 0.7141278982162476,
      "learning_rate": 5e-09,
      "loss": 0.3944,
      "step": 8000
    },
    {
      "epoch": 313.72549019607845,
      "eval_loss": 0.3929772675037384,
      "eval_runtime": 6.3752,
      "eval_samples_per_second": 28.234,
      "eval_steps_per_second": 3.608,
      "step": 8000
    }
  ],
  "logging_steps": 50,
  "max_steps": 8000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 320,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.70355911863679e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}
|
|