{
"best_metric": 0.39249518513679504,
"best_model_checkpoint": "mikhail-panzo/ceb_b64_le5_s8000/checkpoint-6000",
"epoch": 313.72549019607845,
"eval_steps": 500,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.9607843137254903,
"grad_norm": 2.2592813968658447,
"learning_rate": 2.4500000000000004e-07,
"loss": 0.8071,
"step": 50
},
{
"epoch": 3.9215686274509802,
"grad_norm": 1.564003348350525,
"learning_rate": 4.95e-07,
"loss": 0.7677,
"step": 100
},
{
"epoch": 5.882352941176471,
"grad_norm": 1.3204560279846191,
"learning_rate": 7.450000000000001e-07,
"loss": 0.7469,
"step": 150
},
{
"epoch": 7.8431372549019605,
"grad_norm": 1.3252646923065186,
"learning_rate": 9.950000000000002e-07,
"loss": 0.7234,
"step": 200
},
{
"epoch": 9.803921568627452,
"grad_norm": 1.9049391746520996,
"learning_rate": 1.2450000000000002e-06,
"loss": 0.7167,
"step": 250
},
{
"epoch": 11.764705882352942,
"grad_norm": 1.5868775844573975,
"learning_rate": 1.495e-06,
"loss": 0.6949,
"step": 300
},
{
"epoch": 13.72549019607843,
"grad_norm": 2.022406578063965,
"learning_rate": 1.745e-06,
"loss": 0.6581,
"step": 350
},
{
"epoch": 15.686274509803921,
"grad_norm": 1.601341724395752,
"learning_rate": 1.9950000000000004e-06,
"loss": 0.6181,
"step": 400
},
{
"epoch": 17.647058823529413,
"grad_norm": 1.671892523765564,
"learning_rate": 2.245e-06,
"loss": 0.5798,
"step": 450
},
{
"epoch": 19.607843137254903,
"grad_norm": 1.9054476022720337,
"learning_rate": 2.4950000000000003e-06,
"loss": 0.5525,
"step": 500
},
{
"epoch": 19.607843137254903,
"eval_loss": 0.4685615003108978,
"eval_runtime": 6.3848,
"eval_samples_per_second": 28.192,
"eval_steps_per_second": 3.602,
"step": 500
},
{
"epoch": 21.568627450980394,
"grad_norm": 1.5475994348526,
"learning_rate": 2.7450000000000004e-06,
"loss": 0.5407,
"step": 550
},
{
"epoch": 23.529411764705884,
"grad_norm": 1.1005382537841797,
"learning_rate": 2.995e-06,
"loss": 0.5076,
"step": 600
},
{
"epoch": 25.49019607843137,
"grad_norm": 0.979748547077179,
"learning_rate": 3.2450000000000003e-06,
"loss": 0.5071,
"step": 650
},
{
"epoch": 27.45098039215686,
"grad_norm": 1.0884859561920166,
"learning_rate": 3.495e-06,
"loss": 0.4934,
"step": 700
},
{
"epoch": 29.41176470588235,
"grad_norm": 1.8463184833526611,
"learning_rate": 3.745e-06,
"loss": 0.4919,
"step": 750
},
{
"epoch": 31.372549019607842,
"grad_norm": 1.2262991666793823,
"learning_rate": 3.995000000000001e-06,
"loss": 0.4981,
"step": 800
},
{
"epoch": 33.333333333333336,
"grad_norm": 1.0878300666809082,
"learning_rate": 4.245e-06,
"loss": 0.4928,
"step": 850
},
{
"epoch": 35.294117647058826,
"grad_norm": 1.155561089515686,
"learning_rate": 4.495e-06,
"loss": 0.4848,
"step": 900
},
{
"epoch": 37.254901960784316,
"grad_norm": 1.4933425188064575,
"learning_rate": 4.745e-06,
"loss": 0.4754,
"step": 950
},
{
"epoch": 39.21568627450981,
"grad_norm": 1.239863634109497,
"learning_rate": 4.9950000000000005e-06,
"loss": 0.4756,
"step": 1000
},
{
"epoch": 39.21568627450981,
"eval_loss": 0.42764008045196533,
"eval_runtime": 6.3802,
"eval_samples_per_second": 28.212,
"eval_steps_per_second": 3.605,
"step": 1000
},
{
"epoch": 41.1764705882353,
"grad_norm": 1.9258134365081787,
"learning_rate": 5.245e-06,
"loss": 0.4653,
"step": 1050
},
{
"epoch": 43.13725490196079,
"grad_norm": 0.9576085805892944,
"learning_rate": 5.495000000000001e-06,
"loss": 0.465,
"step": 1100
},
{
"epoch": 45.09803921568628,
"grad_norm": 0.9215015769004822,
"learning_rate": 5.745000000000001e-06,
"loss": 0.4743,
"step": 1150
},
{
"epoch": 47.05882352941177,
"grad_norm": 1.1917026042938232,
"learning_rate": 5.995000000000001e-06,
"loss": 0.4581,
"step": 1200
},
{
"epoch": 49.01960784313726,
"grad_norm": 1.30499267578125,
"learning_rate": 6.245000000000001e-06,
"loss": 0.4626,
"step": 1250
},
{
"epoch": 50.98039215686274,
"grad_norm": 1.4004439115524292,
"learning_rate": 6.4950000000000005e-06,
"loss": 0.4591,
"step": 1300
},
{
"epoch": 52.94117647058823,
"grad_norm": 0.892646074295044,
"learning_rate": 6.745000000000001e-06,
"loss": 0.4545,
"step": 1350
},
{
"epoch": 54.90196078431372,
"grad_norm": 1.0985666513442993,
"learning_rate": 6.995000000000001e-06,
"loss": 0.4501,
"step": 1400
},
{
"epoch": 56.86274509803921,
"grad_norm": 1.271141529083252,
"learning_rate": 7.245000000000001e-06,
"loss": 0.4456,
"step": 1450
},
{
"epoch": 58.8235294117647,
"grad_norm": 1.170074224472046,
"learning_rate": 7.495000000000001e-06,
"loss": 0.4543,
"step": 1500
},
{
"epoch": 58.8235294117647,
"eval_loss": 0.4116212725639343,
"eval_runtime": 6.2794,
"eval_samples_per_second": 28.665,
"eval_steps_per_second": 3.663,
"step": 1500
},
{
"epoch": 60.78431372549019,
"grad_norm": 0.9767690896987915,
"learning_rate": 7.745e-06,
"loss": 0.4517,
"step": 1550
},
{
"epoch": 62.745098039215684,
"grad_norm": 1.4193260669708252,
"learning_rate": 7.995e-06,
"loss": 0.4537,
"step": 1600
},
{
"epoch": 64.70588235294117,
"grad_norm": 1.3294360637664795,
"learning_rate": 8.245000000000002e-06,
"loss": 0.4435,
"step": 1650
},
{
"epoch": 66.66666666666667,
"grad_norm": 0.8386899828910828,
"learning_rate": 8.495e-06,
"loss": 0.4507,
"step": 1700
},
{
"epoch": 68.62745098039215,
"grad_norm": 1.0917119979858398,
"learning_rate": 8.745000000000002e-06,
"loss": 0.4409,
"step": 1750
},
{
"epoch": 70.58823529411765,
"grad_norm": 1.0725489854812622,
"learning_rate": 8.995000000000001e-06,
"loss": 0.4449,
"step": 1800
},
{
"epoch": 72.54901960784314,
"grad_norm": 1.3506999015808105,
"learning_rate": 9.245e-06,
"loss": 0.4496,
"step": 1850
},
{
"epoch": 74.50980392156863,
"grad_norm": 0.9701379537582397,
"learning_rate": 9.495000000000001e-06,
"loss": 0.4384,
"step": 1900
},
{
"epoch": 76.47058823529412,
"grad_norm": 1.7079219818115234,
"learning_rate": 9.745e-06,
"loss": 0.4374,
"step": 1950
},
{
"epoch": 78.43137254901961,
"grad_norm": 1.87998628616333,
"learning_rate": 9.995000000000002e-06,
"loss": 0.4346,
"step": 2000
},
{
"epoch": 78.43137254901961,
"eval_loss": 0.4027920663356781,
"eval_runtime": 6.3546,
"eval_samples_per_second": 28.326,
"eval_steps_per_second": 3.619,
"step": 2000
},
{
"epoch": 80.3921568627451,
"grad_norm": 1.1510419845581055,
"learning_rate": 9.918333333333335e-06,
"loss": 0.4326,
"step": 2050
},
{
"epoch": 82.3529411764706,
"grad_norm": 1.2605654001235962,
"learning_rate": 9.835000000000002e-06,
"loss": 0.4355,
"step": 2100
},
{
"epoch": 84.31372549019608,
"grad_norm": 0.866606593132019,
"learning_rate": 9.751666666666667e-06,
"loss": 0.4286,
"step": 2150
},
{
"epoch": 86.27450980392157,
"grad_norm": 2.0733227729797363,
"learning_rate": 9.668333333333334e-06,
"loss": 0.4365,
"step": 2200
},
{
"epoch": 88.23529411764706,
"grad_norm": 0.9726402759552002,
"learning_rate": 9.585e-06,
"loss": 0.4367,
"step": 2250
},
{
"epoch": 90.19607843137256,
"grad_norm": 1.0713222026824951,
"learning_rate": 9.501666666666667e-06,
"loss": 0.4288,
"step": 2300
},
{
"epoch": 92.15686274509804,
"grad_norm": 1.5218483209609985,
"learning_rate": 9.418333333333334e-06,
"loss": 0.435,
"step": 2350
},
{
"epoch": 94.11764705882354,
"grad_norm": 0.8391968011856079,
"learning_rate": 9.335000000000001e-06,
"loss": 0.431,
"step": 2400
},
{
"epoch": 96.07843137254902,
"grad_norm": 1.3989890813827515,
"learning_rate": 9.251666666666668e-06,
"loss": 0.4251,
"step": 2450
},
{
"epoch": 98.03921568627452,
"grad_norm": 0.9168123006820679,
"learning_rate": 9.168333333333333e-06,
"loss": 0.4292,
"step": 2500
},
{
"epoch": 98.03921568627452,
"eval_loss": 0.3997121751308441,
"eval_runtime": 6.4036,
"eval_samples_per_second": 28.109,
"eval_steps_per_second": 3.592,
"step": 2500
},
{
"epoch": 100.0,
"grad_norm": 2.315229892730713,
"learning_rate": 9.085e-06,
"loss": 0.4295,
"step": 2550
},
{
"epoch": 101.96078431372548,
"grad_norm": 1.0224589109420776,
"learning_rate": 9.001666666666667e-06,
"loss": 0.4212,
"step": 2600
},
{
"epoch": 103.92156862745098,
"grad_norm": 1.389236330986023,
"learning_rate": 8.918333333333334e-06,
"loss": 0.4215,
"step": 2650
},
{
"epoch": 105.88235294117646,
"grad_norm": 0.9601902961730957,
"learning_rate": 8.836666666666668e-06,
"loss": 0.4271,
"step": 2700
},
{
"epoch": 107.84313725490196,
"grad_norm": 0.8070810437202454,
"learning_rate": 8.753333333333333e-06,
"loss": 0.4269,
"step": 2750
},
{
"epoch": 109.80392156862744,
"grad_norm": 1.3389467000961304,
"learning_rate": 8.67e-06,
"loss": 0.4209,
"step": 2800
},
{
"epoch": 111.76470588235294,
"grad_norm": 1.0083783864974976,
"learning_rate": 8.586666666666667e-06,
"loss": 0.4143,
"step": 2850
},
{
"epoch": 113.72549019607843,
"grad_norm": 1.390769600868225,
"learning_rate": 8.503333333333334e-06,
"loss": 0.4208,
"step": 2900
},
{
"epoch": 115.68627450980392,
"grad_norm": 1.0967605113983154,
"learning_rate": 8.42e-06,
"loss": 0.4127,
"step": 2950
},
{
"epoch": 117.6470588235294,
"grad_norm": 0.9284217357635498,
"learning_rate": 8.336666666666668e-06,
"loss": 0.4166,
"step": 3000
},
{
"epoch": 117.6470588235294,
"eval_loss": 0.395207017660141,
"eval_runtime": 6.3246,
"eval_samples_per_second": 28.46,
"eval_steps_per_second": 3.637,
"step": 3000
},
{
"epoch": 119.6078431372549,
"grad_norm": 1.1663891077041626,
"learning_rate": 8.253333333333334e-06,
"loss": 0.4155,
"step": 3050
},
{
"epoch": 121.56862745098039,
"grad_norm": 5.156078338623047,
"learning_rate": 8.17e-06,
"loss": 0.4156,
"step": 3100
},
{
"epoch": 123.52941176470588,
"grad_norm": 0.9861869215965271,
"learning_rate": 8.086666666666667e-06,
"loss": 0.4163,
"step": 3150
},
{
"epoch": 125.49019607843137,
"grad_norm": 0.8449072241783142,
"learning_rate": 8.003333333333334e-06,
"loss": 0.4176,
"step": 3200
},
{
"epoch": 127.45098039215686,
"grad_norm": 1.300947904586792,
"learning_rate": 7.92e-06,
"loss": 0.4245,
"step": 3250
},
{
"epoch": 129.41176470588235,
"grad_norm": 0.7830358147621155,
"learning_rate": 7.836666666666667e-06,
"loss": 0.4124,
"step": 3300
},
{
"epoch": 131.37254901960785,
"grad_norm": 1.3250160217285156,
"learning_rate": 7.753333333333334e-06,
"loss": 0.4119,
"step": 3350
},
{
"epoch": 133.33333333333334,
"grad_norm": 0.8610659241676331,
"learning_rate": 7.670000000000001e-06,
"loss": 0.4154,
"step": 3400
},
{
"epoch": 135.2941176470588,
"grad_norm": 1.3154343366622925,
"learning_rate": 7.586666666666668e-06,
"loss": 0.4141,
"step": 3450
},
{
"epoch": 137.2549019607843,
"grad_norm": 0.9761775732040405,
"learning_rate": 7.503333333333333e-06,
"loss": 0.4122,
"step": 3500
},
{
"epoch": 137.2549019607843,
"eval_loss": 0.3956769108772278,
"eval_runtime": 6.4498,
"eval_samples_per_second": 27.908,
"eval_steps_per_second": 3.566,
"step": 3500
},
{
"epoch": 139.2156862745098,
"grad_norm": 1.142308235168457,
"learning_rate": 7.420000000000001e-06,
"loss": 0.4107,
"step": 3550
},
{
"epoch": 141.1764705882353,
"grad_norm": 0.863447368144989,
"learning_rate": 7.336666666666668e-06,
"loss": 0.4093,
"step": 3600
},
{
"epoch": 143.13725490196077,
"grad_norm": 0.9665449261665344,
"learning_rate": 7.253333333333335e-06,
"loss": 0.4114,
"step": 3650
},
{
"epoch": 145.09803921568627,
"grad_norm": 1.046099305152893,
"learning_rate": 7.17e-06,
"loss": 0.4131,
"step": 3700
},
{
"epoch": 147.05882352941177,
"grad_norm": 0.9744811654090881,
"learning_rate": 7.086666666666667e-06,
"loss": 0.4068,
"step": 3750
},
{
"epoch": 149.01960784313727,
"grad_norm": 1.3594048023223877,
"learning_rate": 7.003333333333335e-06,
"loss": 0.4086,
"step": 3800
},
{
"epoch": 150.98039215686273,
"grad_norm": 0.889851450920105,
"learning_rate": 6.92e-06,
"loss": 0.4098,
"step": 3850
},
{
"epoch": 152.94117647058823,
"grad_norm": 0.940967321395874,
"learning_rate": 6.836666666666667e-06,
"loss": 0.4071,
"step": 3900
},
{
"epoch": 154.90196078431373,
"grad_norm": 1.5695271492004395,
"learning_rate": 6.753333333333334e-06,
"loss": 0.4097,
"step": 3950
},
{
"epoch": 156.86274509803923,
"grad_norm": 1.224783182144165,
"learning_rate": 6.6700000000000005e-06,
"loss": 0.4063,
"step": 4000
},
{
"epoch": 156.86274509803923,
"eval_loss": 0.3939879834651947,
"eval_runtime": 6.6684,
"eval_samples_per_second": 26.993,
"eval_steps_per_second": 3.449,
"step": 4000
},
{
"epoch": 158.8235294117647,
"grad_norm": 0.8880926370620728,
"learning_rate": 6.5866666666666666e-06,
"loss": 0.4149,
"step": 4050
},
{
"epoch": 160.7843137254902,
"grad_norm": 0.8387724757194519,
"learning_rate": 6.5033333333333335e-06,
"loss": 0.4066,
"step": 4100
},
{
"epoch": 162.7450980392157,
"grad_norm": 0.9014448523521423,
"learning_rate": 6.42e-06,
"loss": 0.4102,
"step": 4150
},
{
"epoch": 164.7058823529412,
"grad_norm": 0.7693892121315002,
"learning_rate": 6.336666666666667e-06,
"loss": 0.4066,
"step": 4200
},
{
"epoch": 166.66666666666666,
"grad_norm": 0.9693483114242554,
"learning_rate": 6.253333333333333e-06,
"loss": 0.4084,
"step": 4250
},
{
"epoch": 168.62745098039215,
"grad_norm": 0.8914594054222107,
"learning_rate": 6.17e-06,
"loss": 0.4109,
"step": 4300
},
{
"epoch": 170.58823529411765,
"grad_norm": 11.76923942565918,
"learning_rate": 6.086666666666667e-06,
"loss": 0.4109,
"step": 4350
},
{
"epoch": 172.54901960784315,
"grad_norm": 0.8837347626686096,
"learning_rate": 6.003333333333334e-06,
"loss": 0.4051,
"step": 4400
},
{
"epoch": 174.50980392156862,
"grad_norm": 0.8585467338562012,
"learning_rate": 5.92e-06,
"loss": 0.4017,
"step": 4450
},
{
"epoch": 176.47058823529412,
"grad_norm": 0.967064380645752,
"learning_rate": 5.836666666666667e-06,
"loss": 0.4028,
"step": 4500
},
{
"epoch": 176.47058823529412,
"eval_loss": 0.395079106092453,
"eval_runtime": 6.3984,
"eval_samples_per_second": 28.132,
"eval_steps_per_second": 3.595,
"step": 4500
},
{
"epoch": 178.4313725490196,
"grad_norm": 1.2672348022460938,
"learning_rate": 5.753333333333334e-06,
"loss": 0.4068,
"step": 4550
},
{
"epoch": 180.3921568627451,
"grad_norm": 0.8008630275726318,
"learning_rate": 5.67e-06,
"loss": 0.4112,
"step": 4600
},
{
"epoch": 182.35294117647058,
"grad_norm": 1.0430930852890015,
"learning_rate": 5.586666666666667e-06,
"loss": 0.4145,
"step": 4650
},
{
"epoch": 184.31372549019608,
"grad_norm": 0.767160952091217,
"learning_rate": 5.503333333333334e-06,
"loss": 0.3992,
"step": 4700
},
{
"epoch": 186.27450980392157,
"grad_norm": 1.0731803178787231,
"learning_rate": 5.420000000000001e-06,
"loss": 0.4025,
"step": 4750
},
{
"epoch": 188.23529411764707,
"grad_norm": 0.7984374761581421,
"learning_rate": 5.336666666666667e-06,
"loss": 0.4002,
"step": 4800
},
{
"epoch": 190.19607843137254,
"grad_norm": 0.8159019947052002,
"learning_rate": 5.2533333333333336e-06,
"loss": 0.4023,
"step": 4850
},
{
"epoch": 192.15686274509804,
"grad_norm": 1.1482937335968018,
"learning_rate": 5.1700000000000005e-06,
"loss": 0.4021,
"step": 4900
},
{
"epoch": 194.11764705882354,
"grad_norm": 0.9159001708030701,
"learning_rate": 5.086666666666667e-06,
"loss": 0.4058,
"step": 4950
},
{
"epoch": 196.07843137254903,
"grad_norm": 0.8861828446388245,
"learning_rate": 5.0033333333333334e-06,
"loss": 0.3982,
"step": 5000
},
{
"epoch": 196.07843137254903,
"eval_loss": 0.3931240439414978,
"eval_runtime": 6.4434,
"eval_samples_per_second": 27.936,
"eval_steps_per_second": 3.57,
"step": 5000
},
{
"epoch": 198.0392156862745,
"grad_norm": 0.9328787922859192,
"learning_rate": 4.92e-06,
"loss": 0.405,
"step": 5050
},
{
"epoch": 200.0,
"grad_norm": 1.2074756622314453,
"learning_rate": 4.836666666666667e-06,
"loss": 0.3996,
"step": 5100
},
{
"epoch": 201.9607843137255,
"grad_norm": 0.7930981516838074,
"learning_rate": 4.753333333333333e-06,
"loss": 0.3994,
"step": 5150
},
{
"epoch": 203.92156862745097,
"grad_norm": 0.8685852885246277,
"learning_rate": 4.670000000000001e-06,
"loss": 0.3983,
"step": 5200
},
{
"epoch": 205.88235294117646,
"grad_norm": 0.8023931980133057,
"learning_rate": 4.586666666666667e-06,
"loss": 0.4041,
"step": 5250
},
{
"epoch": 207.84313725490196,
"grad_norm": 0.7825555801391602,
"learning_rate": 4.503333333333333e-06,
"loss": 0.4012,
"step": 5300
},
{
"epoch": 209.80392156862746,
"grad_norm": 1.1338242292404175,
"learning_rate": 4.42e-06,
"loss": 0.3996,
"step": 5350
},
{
"epoch": 211.76470588235293,
"grad_norm": 1.3594533205032349,
"learning_rate": 4.336666666666667e-06,
"loss": 0.4022,
"step": 5400
},
{
"epoch": 213.72549019607843,
"grad_norm": 1.0148471593856812,
"learning_rate": 4.253333333333334e-06,
"loss": 0.4003,
"step": 5450
},
{
"epoch": 215.68627450980392,
"grad_norm": 0.932323157787323,
"learning_rate": 4.17e-06,
"loss": 0.4055,
"step": 5500
},
{
"epoch": 215.68627450980392,
"eval_loss": 0.3945559561252594,
"eval_runtime": 6.6419,
"eval_samples_per_second": 27.101,
"eval_steps_per_second": 3.463,
"step": 5500
},
{
"epoch": 217.64705882352942,
"grad_norm": 0.7886359095573425,
"learning_rate": 4.086666666666667e-06,
"loss": 0.3938,
"step": 5550
},
{
"epoch": 219.6078431372549,
"grad_norm": 1.1809077262878418,
"learning_rate": 4.003333333333334e-06,
"loss": 0.4033,
"step": 5600
},
{
"epoch": 221.5686274509804,
"grad_norm": 0.8167886137962341,
"learning_rate": 3.920000000000001e-06,
"loss": 0.4002,
"step": 5650
},
{
"epoch": 223.52941176470588,
"grad_norm": 1.1182901859283447,
"learning_rate": 3.836666666666667e-06,
"loss": 0.3982,
"step": 5700
},
{
"epoch": 225.49019607843138,
"grad_norm": 1.0442403554916382,
"learning_rate": 3.753333333333334e-06,
"loss": 0.3973,
"step": 5750
},
{
"epoch": 227.45098039215685,
"grad_norm": 1.0428043603897095,
"learning_rate": 3.6700000000000004e-06,
"loss": 0.4001,
"step": 5800
},
{
"epoch": 229.41176470588235,
"grad_norm": 0.9441725611686707,
"learning_rate": 3.5866666666666673e-06,
"loss": 0.3976,
"step": 5850
},
{
"epoch": 231.37254901960785,
"grad_norm": 0.8371317386627197,
"learning_rate": 3.5033333333333334e-06,
"loss": 0.3973,
"step": 5900
},
{
"epoch": 233.33333333333334,
"grad_norm": 0.7603981494903564,
"learning_rate": 3.4200000000000007e-06,
"loss": 0.3943,
"step": 5950
},
{
"epoch": 235.2941176470588,
"grad_norm": 1.1027510166168213,
"learning_rate": 3.3366666666666668e-06,
"loss": 0.4019,
"step": 6000
},
{
"epoch": 235.2941176470588,
"eval_loss": 0.39249518513679504,
"eval_runtime": 6.4221,
"eval_samples_per_second": 28.028,
"eval_steps_per_second": 3.581,
"step": 6000
},
{
"epoch": 237.2549019607843,
"grad_norm": 1.0101209878921509,
"learning_rate": 3.2533333333333332e-06,
"loss": 0.396,
"step": 6050
},
{
"epoch": 239.2156862745098,
"grad_norm": 1.2180397510528564,
"learning_rate": 3.17e-06,
"loss": 0.3986,
"step": 6100
},
{
"epoch": 241.1764705882353,
"grad_norm": 0.9254348874092102,
"learning_rate": 3.0866666666666666e-06,
"loss": 0.397,
"step": 6150
},
{
"epoch": 243.13725490196077,
"grad_norm": 1.0118534564971924,
"learning_rate": 3.0033333333333335e-06,
"loss": 0.3974,
"step": 6200
},
{
"epoch": 245.09803921568627,
"grad_norm": 0.8282018899917603,
"learning_rate": 2.92e-06,
"loss": 0.3909,
"step": 6250
},
{
"epoch": 247.05882352941177,
"grad_norm": 0.8930662870407104,
"learning_rate": 2.836666666666667e-06,
"loss": 0.3986,
"step": 6300
},
{
"epoch": 249.01960784313727,
"grad_norm": 1.106566071510315,
"learning_rate": 2.7533333333333334e-06,
"loss": 0.3984,
"step": 6350
},
{
"epoch": 250.98039215686273,
"grad_norm": 0.9557139873504639,
"learning_rate": 2.6700000000000003e-06,
"loss": 0.3975,
"step": 6400
},
{
"epoch": 252.94117647058823,
"grad_norm": 1.1916723251342773,
"learning_rate": 2.5866666666666667e-06,
"loss": 0.402,
"step": 6450
},
{
"epoch": 254.90196078431373,
"grad_norm": 0.8120290637016296,
"learning_rate": 2.5033333333333336e-06,
"loss": 0.4,
"step": 6500
},
{
"epoch": 254.90196078431373,
"eval_loss": 0.39403682947158813,
"eval_runtime": 6.3124,
"eval_samples_per_second": 28.515,
"eval_steps_per_second": 3.644,
"step": 6500
},
{
"epoch": 256.8627450980392,
"grad_norm": 0.9240416884422302,
"learning_rate": 2.42e-06,
"loss": 0.391,
"step": 6550
},
{
"epoch": 258.8235294117647,
"grad_norm": 1.0390872955322266,
"learning_rate": 2.3366666666666666e-06,
"loss": 0.3985,
"step": 6600
},
{
"epoch": 260.7843137254902,
"grad_norm": 0.7079731822013855,
"learning_rate": 2.2550000000000004e-06,
"loss": 0.3947,
"step": 6650
},
{
"epoch": 262.7450980392157,
"grad_norm": 1.0071649551391602,
"learning_rate": 2.171666666666667e-06,
"loss": 0.4006,
"step": 6700
},
{
"epoch": 264.70588235294116,
"grad_norm": 0.9264857769012451,
"learning_rate": 2.088333333333334e-06,
"loss": 0.3983,
"step": 6750
},
{
"epoch": 266.6666666666667,
"grad_norm": 0.7591241002082825,
"learning_rate": 2.0050000000000003e-06,
"loss": 0.3995,
"step": 6800
},
{
"epoch": 268.62745098039215,
"grad_norm": 0.9408916234970093,
"learning_rate": 1.9216666666666668e-06,
"loss": 0.4013,
"step": 6850
},
{
"epoch": 270.5882352941176,
"grad_norm": 0.8653966188430786,
"learning_rate": 1.8383333333333334e-06,
"loss": 0.3948,
"step": 6900
},
{
"epoch": 272.54901960784315,
"grad_norm": 1.010793924331665,
"learning_rate": 1.7550000000000001e-06,
"loss": 0.3935,
"step": 6950
},
{
"epoch": 274.5098039215686,
"grad_norm": 1.1880955696105957,
"learning_rate": 1.6716666666666666e-06,
"loss": 0.4046,
"step": 7000
},
{
"epoch": 274.5098039215686,
"eval_loss": 0.3952997326850891,
"eval_runtime": 6.3393,
"eval_samples_per_second": 28.394,
"eval_steps_per_second": 3.628,
"step": 7000
},
{
"epoch": 276.47058823529414,
"grad_norm": 0.9707151651382446,
"learning_rate": 1.5883333333333333e-06,
"loss": 0.4001,
"step": 7050
},
{
"epoch": 278.4313725490196,
"grad_norm": 0.7843708395957947,
"learning_rate": 1.505e-06,
"loss": 0.3942,
"step": 7100
},
{
"epoch": 280.3921568627451,
"grad_norm": 0.900497555732727,
"learning_rate": 1.4216666666666667e-06,
"loss": 0.3981,
"step": 7150
},
{
"epoch": 282.3529411764706,
"grad_norm": 0.7594972848892212,
"learning_rate": 1.3383333333333334e-06,
"loss": 0.3961,
"step": 7200
},
{
"epoch": 284.3137254901961,
"grad_norm": 0.7281601428985596,
"learning_rate": 1.255e-06,
"loss": 0.394,
"step": 7250
},
{
"epoch": 286.27450980392155,
"grad_norm": 0.7863117456436157,
"learning_rate": 1.1716666666666667e-06,
"loss": 0.3899,
"step": 7300
},
{
"epoch": 288.2352941176471,
"grad_norm": 0.7224944233894348,
"learning_rate": 1.0883333333333334e-06,
"loss": 0.3993,
"step": 7350
},
{
"epoch": 290.19607843137254,
"grad_norm": 0.7235053181648254,
"learning_rate": 1.0050000000000001e-06,
"loss": 0.3928,
"step": 7400
},
{
"epoch": 292.15686274509807,
"grad_norm": 0.7260425686836243,
"learning_rate": 9.216666666666667e-07,
"loss": 0.3904,
"step": 7450
},
{
"epoch": 294.11764705882354,
"grad_norm": 0.7799311876296997,
"learning_rate": 8.383333333333334e-07,
"loss": 0.3955,
"step": 7500
},
{
"epoch": 294.11764705882354,
"eval_loss": 0.3945452570915222,
"eval_runtime": 6.6937,
"eval_samples_per_second": 26.891,
"eval_steps_per_second": 3.436,
"step": 7500
},
{
"epoch": 296.078431372549,
"grad_norm": 0.7973820567131042,
"learning_rate": 7.550000000000001e-07,
"loss": 0.3954,
"step": 7550
},
{
"epoch": 298.03921568627453,
"grad_norm": 0.7256256341934204,
"learning_rate": 6.716666666666668e-07,
"loss": 0.3934,
"step": 7600
},
{
"epoch": 300.0,
"grad_norm": 1.1149753332138062,
"learning_rate": 5.883333333333333e-07,
"loss": 0.3913,
"step": 7650
},
{
"epoch": 301.96078431372547,
"grad_norm": 0.6526250243186951,
"learning_rate": 5.05e-07,
"loss": 0.391,
"step": 7700
},
{
"epoch": 303.921568627451,
"grad_norm": 0.8052578568458557,
"learning_rate": 4.2166666666666667e-07,
"loss": 0.3889,
"step": 7750
},
{
"epoch": 305.88235294117646,
"grad_norm": 0.7933726906776428,
"learning_rate": 3.3833333333333336e-07,
"loss": 0.3918,
"step": 7800
},
{
"epoch": 307.84313725490193,
"grad_norm": 0.7680366635322571,
"learning_rate": 2.55e-07,
"loss": 0.3948,
"step": 7850
},
{
"epoch": 309.80392156862746,
"grad_norm": 0.8762979507446289,
"learning_rate": 1.7166666666666668e-07,
"loss": 0.3937,
"step": 7900
},
{
"epoch": 311.7647058823529,
"grad_norm": 0.7918867468833923,
"learning_rate": 8.833333333333334e-08,
"loss": 0.3889,
"step": 7950
},
{
"epoch": 313.72549019607845,
"grad_norm": 0.7141278982162476,
"learning_rate": 5e-09,
"loss": 0.3944,
"step": 8000
},
{
"epoch": 313.72549019607845,
"eval_loss": 0.3929772675037384,
"eval_runtime": 6.3752,
"eval_samples_per_second": 28.234,
"eval_steps_per_second": 3.608,
"step": 8000
}
],
"logging_steps": 50,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 320,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.70355911863679e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}