|
{ |
|
"best_metric": 0.41427454352378845, |
|
"best_model_checkpoint": "mikhail_panzo/ceb_b128_le3_s8000/checkpoint-500", |
|
"epoch": 627.4509803921569, |
|
"eval_steps": 500, |
|
"global_step": 8000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.9215686274509802, |
|
"grad_norm": 2.899599313735962, |
|
"learning_rate": 2.4500000000000003e-05, |
|
"loss": 0.7166, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 7.8431372549019605, |
|
"grad_norm": 3.100567102432251, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.5261, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 3.0574705600738525, |
|
"learning_rate": 7.45e-05, |
|
"loss": 0.4841, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 15.686274509803921, |
|
"grad_norm": 1.0856355428695679, |
|
"learning_rate": 9.95e-05, |
|
"loss": 0.4634, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 19.607843137254903, |
|
"grad_norm": 3.049023389816284, |
|
"learning_rate": 0.0001245, |
|
"loss": 0.4622, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 1.3187408447265625, |
|
"learning_rate": 0.0001495, |
|
"loss": 0.4491, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 27.45098039215686, |
|
"grad_norm": 2.879023551940918, |
|
"learning_rate": 0.00017449999999999999, |
|
"loss": 0.4391, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 31.372549019607842, |
|
"grad_norm": 0.908079981803894, |
|
"learning_rate": 0.00019950000000000002, |
|
"loss": 0.4446, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 35.294117647058826, |
|
"grad_norm": 2.698949098587036, |
|
"learning_rate": 0.0002245, |
|
"loss": 0.4475, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 39.21568627450981, |
|
"grad_norm": 4.266517639160156, |
|
"learning_rate": 0.0002495, |
|
"loss": 0.4379, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 39.21568627450981, |
|
"eval_loss": 0.41427454352378845, |
|
"eval_runtime": 6.5876, |
|
"eval_samples_per_second": 27.324, |
|
"eval_steps_per_second": 3.491, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 43.13725490196079, |
|
"grad_norm": 2.933168649673462, |
|
"learning_rate": 0.0002745, |
|
"loss": 0.4306, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 47.05882352941177, |
|
"grad_norm": 2.7896311283111572, |
|
"learning_rate": 0.0002995, |
|
"loss": 0.4502, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 50.98039215686274, |
|
"grad_norm": 2.2012181282043457, |
|
"learning_rate": 0.00032450000000000003, |
|
"loss": 0.4493, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 54.90196078431372, |
|
"grad_norm": 2.1830649375915527, |
|
"learning_rate": 0.0003495, |
|
"loss": 0.4231, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 58.8235294117647, |
|
"grad_norm": 2.837082862854004, |
|
"learning_rate": 0.0003745, |
|
"loss": 0.4237, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 62.745098039215684, |
|
"grad_norm": 2.12803316116333, |
|
"learning_rate": 0.0003995, |
|
"loss": 0.4527, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 7.096790790557861, |
|
"learning_rate": 0.0004245, |
|
"loss": 0.4536, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 70.58823529411765, |
|
"grad_norm": 2.911712884902954, |
|
"learning_rate": 0.00044950000000000003, |
|
"loss": 0.4573, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 74.50980392156863, |
|
"grad_norm": 3.439180374145508, |
|
"learning_rate": 0.0004745, |
|
"loss": 0.4726, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 78.43137254901961, |
|
"grad_norm": 4.071909427642822, |
|
"learning_rate": 0.0004995, |
|
"loss": 0.4492, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 78.43137254901961, |
|
"eval_loss": 0.4376925528049469, |
|
"eval_runtime": 6.6325, |
|
"eval_samples_per_second": 27.139, |
|
"eval_steps_per_second": 3.468, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 82.3529411764706, |
|
"grad_norm": 6.268165111541748, |
|
"learning_rate": 0.0005245, |
|
"loss": 0.4562, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 86.27450980392157, |
|
"grad_norm": 2.190829038619995, |
|
"learning_rate": 0.0005495, |
|
"loss": 0.4473, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 90.19607843137256, |
|
"grad_norm": 6.477546215057373, |
|
"learning_rate": 0.0005745, |
|
"loss": 0.4587, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 94.11764705882354, |
|
"grad_norm": 2.3519718647003174, |
|
"learning_rate": 0.0005995000000000001, |
|
"loss": 0.4677, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 98.03921568627452, |
|
"grad_norm": 3.0339012145996094, |
|
"learning_rate": 0.0006245000000000001, |
|
"loss": 0.5425, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 101.96078431372548, |
|
"grad_norm": 5.318001747131348, |
|
"learning_rate": 0.0006490000000000001, |
|
"loss": 0.9214, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 105.88235294117646, |
|
"grad_norm": 0.10876531153917313, |
|
"learning_rate": 0.000674, |
|
"loss": 1.5183, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 109.80392156862744, |
|
"grad_norm": 1.2959858179092407, |
|
"learning_rate": 0.000699, |
|
"loss": 1.497, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 113.72549019607843, |
|
"grad_norm": 0.14063771069049835, |
|
"learning_rate": 0.000724, |
|
"loss": 1.4423, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 117.6470588235294, |
|
"grad_norm": 0.3507501184940338, |
|
"learning_rate": 0.000749, |
|
"loss": 1.4417, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 117.6470588235294, |
|
"eval_loss": 1.4372303485870361, |
|
"eval_runtime": 6.5947, |
|
"eval_samples_per_second": 27.295, |
|
"eval_steps_per_second": 3.488, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 121.56862745098039, |
|
"grad_norm": 0.041443753987550735, |
|
"learning_rate": 0.0007740000000000001, |
|
"loss": 1.4335, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 125.49019607843137, |
|
"grad_norm": 0.12605535984039307, |
|
"learning_rate": 0.000799, |
|
"loss": 1.434, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 129.41176470588235, |
|
"grad_norm": 0.22777751088142395, |
|
"learning_rate": 0.000824, |
|
"loss": 1.4312, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 133.33333333333334, |
|
"grad_norm": 0.317199170589447, |
|
"learning_rate": 0.000849, |
|
"loss": 1.4323, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 137.2549019607843, |
|
"grad_norm": 0.10237721353769302, |
|
"learning_rate": 0.000874, |
|
"loss": 1.4332, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 141.1764705882353, |
|
"grad_norm": 0.23147189617156982, |
|
"learning_rate": 0.0008990000000000001, |
|
"loss": 1.4313, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 145.09803921568627, |
|
"grad_norm": 0.1493494212627411, |
|
"learning_rate": 0.000924, |
|
"loss": 1.4446, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 149.01960784313727, |
|
"grad_norm": 0.10309738665819168, |
|
"learning_rate": 0.000949, |
|
"loss": 1.4315, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 152.94117647058823, |
|
"grad_norm": 0.1577301323413849, |
|
"learning_rate": 0.000974, |
|
"loss": 1.4305, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 156.86274509803923, |
|
"grad_norm": 0.14740918576717377, |
|
"learning_rate": 0.000999, |
|
"loss": 1.4311, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 156.86274509803923, |
|
"eval_loss": 1.4307715892791748, |
|
"eval_runtime": 6.5817, |
|
"eval_samples_per_second": 27.349, |
|
"eval_steps_per_second": 3.495, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 160.7843137254902, |
|
"grad_norm": 0.04897089675068855, |
|
"learning_rate": 0.000992, |
|
"loss": 1.4303, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 164.7058823529412, |
|
"grad_norm": 0.052775364369153976, |
|
"learning_rate": 0.0009836666666666668, |
|
"loss": 1.4293, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 168.62745098039215, |
|
"grad_norm": 0.07608811557292938, |
|
"learning_rate": 0.0009753333333333334, |
|
"loss": 1.4302, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 172.54901960784315, |
|
"grad_norm": 0.18155619502067566, |
|
"learning_rate": 0.000967, |
|
"loss": 1.4298, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 176.47058823529412, |
|
"grad_norm": 0.23861084878444672, |
|
"learning_rate": 0.0009586666666666667, |
|
"loss": 1.4492, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 180.3921568627451, |
|
"grad_norm": 0.07681425660848618, |
|
"learning_rate": 0.0009503333333333334, |
|
"loss": 1.4352, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 184.31372549019608, |
|
"grad_norm": 0.1355891227722168, |
|
"learning_rate": 0.000942, |
|
"loss": 1.4308, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 188.23529411764707, |
|
"grad_norm": 0.17761075496673584, |
|
"learning_rate": 0.0009336666666666666, |
|
"loss": 1.4678, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 192.15686274509804, |
|
"grad_norm": 0.10840369015932083, |
|
"learning_rate": 0.0009253333333333333, |
|
"loss": 1.4288, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 196.07843137254903, |
|
"grad_norm": 0.15987902879714966, |
|
"learning_rate": 0.0009170000000000001, |
|
"loss": 1.4506, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 196.07843137254903, |
|
"eval_loss": 1.4364666938781738, |
|
"eval_runtime": 6.5311, |
|
"eval_samples_per_second": 27.561, |
|
"eval_steps_per_second": 3.522, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"grad_norm": 0.056504905223846436, |
|
"learning_rate": 0.0009086666666666667, |
|
"loss": 1.4315, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 203.92156862745097, |
|
"grad_norm": 0.07093175500631332, |
|
"learning_rate": 0.0009003333333333334, |
|
"loss": 1.4302, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 207.84313725490196, |
|
"grad_norm": 0.04637598991394043, |
|
"learning_rate": 0.000892, |
|
"loss": 1.4306, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 211.76470588235293, |
|
"grad_norm": 0.062212761491537094, |
|
"learning_rate": 0.0008836666666666667, |
|
"loss": 1.4296, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 215.68627450980392, |
|
"grad_norm": 0.1276828944683075, |
|
"learning_rate": 0.0008753333333333333, |
|
"loss": 1.4433, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 219.6078431372549, |
|
"grad_norm": 0.07146938890218735, |
|
"learning_rate": 0.000867, |
|
"loss": 1.4309, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 223.52941176470588, |
|
"grad_norm": 0.10493505001068115, |
|
"learning_rate": 0.0008586666666666668, |
|
"loss": 1.43, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 227.45098039215685, |
|
"grad_norm": 0.11434385925531387, |
|
"learning_rate": 0.0008503333333333334, |
|
"loss": 1.4276, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 231.37254901960785, |
|
"grad_norm": 0.16153867542743683, |
|
"learning_rate": 0.000842, |
|
"loss": 1.4288, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 235.2941176470588, |
|
"grad_norm": 0.14236119389533997, |
|
"learning_rate": 0.0008336666666666667, |
|
"loss": 1.4286, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 235.2941176470588, |
|
"eval_loss": 1.4312187433242798, |
|
"eval_runtime": 6.6382, |
|
"eval_samples_per_second": 27.116, |
|
"eval_steps_per_second": 3.465, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 239.2156862745098, |
|
"grad_norm": 0.08121176809072495, |
|
"learning_rate": 0.0008253333333333334, |
|
"loss": 1.426, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 243.13725490196077, |
|
"grad_norm": 0.04294486716389656, |
|
"learning_rate": 0.000817, |
|
"loss": 1.4278, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 247.05882352941177, |
|
"grad_norm": 0.09214005619287491, |
|
"learning_rate": 0.0008086666666666666, |
|
"loss": 1.4283, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 250.98039215686273, |
|
"grad_norm": 0.06561414152383804, |
|
"learning_rate": 0.0008003333333333333, |
|
"loss": 1.4286, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 254.90196078431373, |
|
"grad_norm": 0.06078485772013664, |
|
"learning_rate": 0.0007920000000000001, |
|
"loss": 1.4275, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 258.8235294117647, |
|
"grad_norm": 0.15776388347148895, |
|
"learning_rate": 0.0007836666666666667, |
|
"loss": 1.4296, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 262.7450980392157, |
|
"grad_norm": 0.07666509598493576, |
|
"learning_rate": 0.0007753333333333334, |
|
"loss": 1.4264, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 266.6666666666667, |
|
"grad_norm": 0.12861433625221252, |
|
"learning_rate": 0.000767, |
|
"loss": 1.4301, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 270.5882352941176, |
|
"grad_norm": 0.058193549513816833, |
|
"learning_rate": 0.0007586666666666667, |
|
"loss": 1.4288, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 274.5098039215686, |
|
"grad_norm": 0.2083207666873932, |
|
"learning_rate": 0.0007503333333333333, |
|
"loss": 1.4285, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 274.5098039215686, |
|
"eval_loss": 1.4328523874282837, |
|
"eval_runtime": 6.505, |
|
"eval_samples_per_second": 27.671, |
|
"eval_steps_per_second": 3.536, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 278.4313725490196, |
|
"grad_norm": 0.1663893461227417, |
|
"learning_rate": 0.000742, |
|
"loss": 1.4281, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 282.3529411764706, |
|
"grad_norm": 0.034747958183288574, |
|
"learning_rate": 0.0007336666666666668, |
|
"loss": 1.4278, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 286.27450980392155, |
|
"grad_norm": 0.0872250571846962, |
|
"learning_rate": 0.0007253333333333334, |
|
"loss": 1.4258, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 290.19607843137254, |
|
"grad_norm": 0.08757229149341583, |
|
"learning_rate": 0.000717, |
|
"loss": 1.4224, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 294.11764705882354, |
|
"grad_norm": 0.1288028061389923, |
|
"learning_rate": 0.0007086666666666667, |
|
"loss": 1.425, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 298.03921568627453, |
|
"grad_norm": 0.08947344869375229, |
|
"learning_rate": 0.0007003333333333334, |
|
"loss": 1.4219, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 301.96078431372547, |
|
"grad_norm": 0.10822741687297821, |
|
"learning_rate": 0.000692, |
|
"loss": 1.4217, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 305.88235294117646, |
|
"grad_norm": 0.08467403054237366, |
|
"learning_rate": 0.0006836666666666666, |
|
"loss": 1.423, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 309.80392156862746, |
|
"grad_norm": 0.06203661486506462, |
|
"learning_rate": 0.0006753333333333333, |
|
"loss": 1.4229, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 313.72549019607845, |
|
"grad_norm": 0.11116231977939606, |
|
"learning_rate": 0.0006670000000000001, |
|
"loss": 1.421, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 313.72549019607845, |
|
"eval_loss": 1.4406124353408813, |
|
"eval_runtime": 6.7269, |
|
"eval_samples_per_second": 26.758, |
|
"eval_steps_per_second": 3.419, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 317.6470588235294, |
|
"grad_norm": 0.22143186628818512, |
|
"learning_rate": 0.0006586666666666667, |
|
"loss": 1.4416, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 321.5686274509804, |
|
"grad_norm": 0.04169169440865517, |
|
"learning_rate": 0.0006503333333333334, |
|
"loss": 1.4239, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 325.4901960784314, |
|
"grad_norm": 0.09313623607158661, |
|
"learning_rate": 0.000642, |
|
"loss": 1.4219, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 329.4117647058824, |
|
"grad_norm": 0.1355171650648117, |
|
"learning_rate": 0.0006336666666666667, |
|
"loss": 1.422, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 333.3333333333333, |
|
"grad_norm": 0.11701258271932602, |
|
"learning_rate": 0.0006253333333333333, |
|
"loss": 1.4288, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 337.2549019607843, |
|
"grad_norm": 0.20015889406204224, |
|
"learning_rate": 0.000617, |
|
"loss": 1.423, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 341.1764705882353, |
|
"grad_norm": 0.061756107956171036, |
|
"learning_rate": 0.0006086666666666668, |
|
"loss": 1.4219, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 345.0980392156863, |
|
"grad_norm": 0.10823976993560791, |
|
"learning_rate": 0.0006003333333333334, |
|
"loss": 1.4222, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 349.01960784313724, |
|
"grad_norm": 0.13286006450653076, |
|
"learning_rate": 0.000592, |
|
"loss": 1.4292, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 352.94117647058823, |
|
"grad_norm": 0.10281714797019958, |
|
"learning_rate": 0.0005836666666666667, |
|
"loss": 1.4228, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 352.94117647058823, |
|
"eval_loss": 1.433663249015808, |
|
"eval_runtime": 6.7809, |
|
"eval_samples_per_second": 26.545, |
|
"eval_steps_per_second": 3.392, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 356.8627450980392, |
|
"grad_norm": 0.045487239956855774, |
|
"learning_rate": 0.0005753333333333334, |
|
"loss": 1.4237, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 360.7843137254902, |
|
"grad_norm": 0.08421700447797775, |
|
"learning_rate": 0.000567, |
|
"loss": 1.4216, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 364.70588235294116, |
|
"grad_norm": 0.03885773941874504, |
|
"learning_rate": 0.0005586666666666666, |
|
"loss": 1.423, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 368.62745098039215, |
|
"grad_norm": 0.06695468723773956, |
|
"learning_rate": 0.0005503333333333333, |
|
"loss": 1.4227, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 372.54901960784315, |
|
"grad_norm": 0.059906505048274994, |
|
"learning_rate": 0.0005420000000000001, |
|
"loss": 1.4216, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 376.47058823529414, |
|
"grad_norm": 0.13581004738807678, |
|
"learning_rate": 0.0005336666666666667, |
|
"loss": 1.4241, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 380.3921568627451, |
|
"grad_norm": 0.10568855702877045, |
|
"learning_rate": 0.0005253333333333334, |
|
"loss": 1.4219, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 384.3137254901961, |
|
"grad_norm": 0.16822956502437592, |
|
"learning_rate": 0.000517, |
|
"loss": 1.4224, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 388.2352941176471, |
|
"grad_norm": 0.11007729917764664, |
|
"learning_rate": 0.0005086666666666667, |
|
"loss": 1.4222, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 392.15686274509807, |
|
"grad_norm": 0.06487352401018143, |
|
"learning_rate": 0.0005003333333333333, |
|
"loss": 1.4232, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 392.15686274509807, |
|
"eval_loss": 1.4328006505966187, |
|
"eval_runtime": 6.65, |
|
"eval_samples_per_second": 27.068, |
|
"eval_steps_per_second": 3.459, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 396.078431372549, |
|
"grad_norm": 0.06124414503574371, |
|
"learning_rate": 0.000492, |
|
"loss": 1.422, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"grad_norm": 0.15419355034828186, |
|
"learning_rate": 0.0004836666666666667, |
|
"loss": 1.4212, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 403.921568627451, |
|
"grad_norm": 0.032941777259111404, |
|
"learning_rate": 0.00047533333333333336, |
|
"loss": 1.4223, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 407.84313725490193, |
|
"grad_norm": 0.1344718337059021, |
|
"learning_rate": 0.000467, |
|
"loss": 1.4223, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 411.7647058823529, |
|
"grad_norm": 0.08072862774133682, |
|
"learning_rate": 0.0004586666666666667, |
|
"loss": 1.4202, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 415.6862745098039, |
|
"grad_norm": 0.10441293567419052, |
|
"learning_rate": 0.0004503333333333333, |
|
"loss": 1.4219, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 419.6078431372549, |
|
"grad_norm": 0.17147202789783478, |
|
"learning_rate": 0.000442, |
|
"loss": 1.4218, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 423.52941176470586, |
|
"grad_norm": 0.0913294106721878, |
|
"learning_rate": 0.0004336666666666667, |
|
"loss": 1.4205, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 427.45098039215685, |
|
"grad_norm": 0.08024486899375916, |
|
"learning_rate": 0.00042533333333333334, |
|
"loss": 1.42, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 431.37254901960785, |
|
"grad_norm": 0.20594315230846405, |
|
"learning_rate": 0.000417, |
|
"loss": 1.4208, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 431.37254901960785, |
|
"eval_loss": 1.4330966472625732, |
|
"eval_runtime": 6.5131, |
|
"eval_samples_per_second": 27.637, |
|
"eval_steps_per_second": 3.531, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 435.29411764705884, |
|
"grad_norm": 0.16393162310123444, |
|
"learning_rate": 0.00040866666666666666, |
|
"loss": 1.4215, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 439.2156862745098, |
|
"grad_norm": 0.12962013483047485, |
|
"learning_rate": 0.0004003333333333333, |
|
"loss": 1.4221, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 443.1372549019608, |
|
"grad_norm": 0.09659314155578613, |
|
"learning_rate": 0.00039200000000000004, |
|
"loss": 1.4197, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 447.05882352941177, |
|
"grad_norm": 0.05516098812222481, |
|
"learning_rate": 0.00038366666666666665, |
|
"loss": 1.4214, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 450.98039215686276, |
|
"grad_norm": 0.08098474889993668, |
|
"learning_rate": 0.00037533333333333337, |
|
"loss": 1.4224, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 454.9019607843137, |
|
"grad_norm": 0.20328585803508759, |
|
"learning_rate": 0.000367, |
|
"loss": 1.4215, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 458.8235294117647, |
|
"grad_norm": 0.04339490830898285, |
|
"learning_rate": 0.0003586666666666667, |
|
"loss": 1.4214, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 462.7450980392157, |
|
"grad_norm": 0.0780976340174675, |
|
"learning_rate": 0.00035033333333333336, |
|
"loss": 1.4215, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 466.6666666666667, |
|
"grad_norm": 0.09538757055997849, |
|
"learning_rate": 0.000342, |
|
"loss": 1.4206, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 470.5882352941176, |
|
"grad_norm": 0.10228724032640457, |
|
"learning_rate": 0.0003336666666666667, |
|
"loss": 1.4211, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 470.5882352941176, |
|
"eval_loss": 1.4311357736587524, |
|
"eval_runtime": 6.8021, |
|
"eval_samples_per_second": 26.462, |
|
"eval_steps_per_second": 3.381, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 474.5098039215686, |
|
"grad_norm": 0.05785190686583519, |
|
"learning_rate": 0.0003253333333333333, |
|
"loss": 1.4195, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 478.4313725490196, |
|
"grad_norm": 0.040976058691740036, |
|
"learning_rate": 0.000317, |
|
"loss": 1.4218, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 482.3529411764706, |
|
"grad_norm": 0.0426504947245121, |
|
"learning_rate": 0.00030866666666666667, |
|
"loss": 1.4218, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 486.27450980392155, |
|
"grad_norm": 0.03735409677028656, |
|
"learning_rate": 0.00030033333333333333, |
|
"loss": 1.4196, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 490.19607843137254, |
|
"grad_norm": 0.12306160479784012, |
|
"learning_rate": 0.000292, |
|
"loss": 1.4222, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 494.11764705882354, |
|
"grad_norm": 0.028285862877964973, |
|
"learning_rate": 0.00028366666666666666, |
|
"loss": 1.4212, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 498.03921568627453, |
|
"grad_norm": 0.13171572983264923, |
|
"learning_rate": 0.0002753333333333333, |
|
"loss": 1.4211, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 501.96078431372547, |
|
"grad_norm": 0.2669251561164856, |
|
"learning_rate": 0.00026700000000000004, |
|
"loss": 1.4206, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 505.88235294117646, |
|
"grad_norm": 0.10635057091712952, |
|
"learning_rate": 0.00025866666666666665, |
|
"loss": 1.4198, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 509.80392156862746, |
|
"grad_norm": 0.04717053100466728, |
|
"learning_rate": 0.00025033333333333336, |
|
"loss": 1.4204, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 509.80392156862746, |
|
"eval_loss": 1.4344011545181274, |
|
"eval_runtime": 6.7898, |
|
"eval_samples_per_second": 26.51, |
|
"eval_steps_per_second": 3.387, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 513.7254901960785, |
|
"grad_norm": 0.028606118634343147, |
|
"learning_rate": 0.000242, |
|
"loss": 1.4203, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 517.6470588235294, |
|
"grad_norm": 0.07845211774110794, |
|
"learning_rate": 0.00023366666666666666, |
|
"loss": 1.42, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 521.5686274509804, |
|
"grad_norm": 0.07225753366947174, |
|
"learning_rate": 0.00022533333333333333, |
|
"loss": 1.4197, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 525.4901960784314, |
|
"grad_norm": 0.038016363978385925, |
|
"learning_rate": 0.00021700000000000002, |
|
"loss": 1.4199, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 529.4117647058823, |
|
"grad_norm": 0.06095300614833832, |
|
"learning_rate": 0.00020866666666666668, |
|
"loss": 1.421, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 533.3333333333334, |
|
"grad_norm": 0.07751613110303879, |
|
"learning_rate": 0.00020033333333333334, |
|
"loss": 1.4212, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 537.2549019607843, |
|
"grad_norm": 0.048244625329971313, |
|
"learning_rate": 0.000192, |
|
"loss": 1.4171, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 541.1764705882352, |
|
"grad_norm": 0.041657689958810806, |
|
"learning_rate": 0.00018366666666666667, |
|
"loss": 1.4206, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 545.0980392156863, |
|
"grad_norm": 0.01942858286201954, |
|
"learning_rate": 0.00017533333333333336, |
|
"loss": 1.4202, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 549.0196078431372, |
|
"grad_norm": 0.0659833624958992, |
|
"learning_rate": 0.00016700000000000002, |
|
"loss": 1.4196, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 549.0196078431372, |
|
"eval_loss": 1.4362238645553589, |
|
"eval_runtime": 6.7359, |
|
"eval_samples_per_second": 26.722, |
|
"eval_steps_per_second": 3.415, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 552.9411764705883, |
|
"grad_norm": 0.06083720177412033, |
|
"learning_rate": 0.00015866666666666668, |
|
"loss": 1.42, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 556.8627450980392, |
|
"grad_norm": 0.07482324540615082, |
|
"learning_rate": 0.00015033333333333335, |
|
"loss": 1.4184, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 560.7843137254902, |
|
"grad_norm": 0.05342550203204155, |
|
"learning_rate": 0.00014199999999999998, |
|
"loss": 1.4186, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 564.7058823529412, |
|
"grad_norm": 0.09053777158260345, |
|
"learning_rate": 0.00013366666666666667, |
|
"loss": 1.4191, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 568.6274509803922, |
|
"grad_norm": 0.08072460442781448, |
|
"learning_rate": 0.00012533333333333334, |
|
"loss": 1.42, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 572.5490196078431, |
|
"grad_norm": 0.09836099296808243, |
|
"learning_rate": 0.00011700000000000001, |
|
"loss": 1.4193, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 576.4705882352941, |
|
"grad_norm": 0.05738500505685806, |
|
"learning_rate": 0.00010866666666666666, |
|
"loss": 1.42, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 580.3921568627451, |
|
"grad_norm": 0.06691340357065201, |
|
"learning_rate": 0.00010033333333333334, |
|
"loss": 1.4185, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 584.3137254901961, |
|
"grad_norm": 0.06185409054160118, |
|
"learning_rate": 9.2e-05, |
|
"loss": 1.4172, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 588.2352941176471, |
|
"grad_norm": 0.14216652512550354, |
|
"learning_rate": 8.366666666666666e-05, |
|
"loss": 1.4194, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 588.2352941176471, |
|
"eval_loss": 1.4361063241958618, |
|
"eval_runtime": 6.733, |
|
"eval_samples_per_second": 26.734, |
|
"eval_steps_per_second": 3.416, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 592.156862745098, |
|
"grad_norm": 0.027007540687918663, |
|
"learning_rate": 7.533333333333334e-05, |
|
"loss": 1.4173, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 596.0784313725491, |
|
"grad_norm": 0.04154467582702637, |
|
"learning_rate": 6.7e-05, |
|
"loss": 1.4186, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 600.0, |
|
"grad_norm": 0.10418181121349335, |
|
"learning_rate": 5.8666666666666665e-05, |
|
"loss": 1.4187, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 603.9215686274509, |
|
"grad_norm": 0.041870325803756714, |
|
"learning_rate": 5.0333333333333335e-05, |
|
"loss": 1.4188, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 607.843137254902, |
|
"grad_norm": 0.08349625766277313, |
|
"learning_rate": 4.2000000000000004e-05, |
|
"loss": 1.4191, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 611.7647058823529, |
|
"grad_norm": 0.08984719216823578, |
|
"learning_rate": 3.366666666666667e-05, |
|
"loss": 1.4174, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 615.6862745098039, |
|
"grad_norm": 0.028009561821818352, |
|
"learning_rate": 2.5333333333333334e-05, |
|
"loss": 1.4203, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 619.6078431372549, |
|
"grad_norm": 0.025170741602778435, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 1.4183, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 623.5294117647059, |
|
"grad_norm": 0.08974526822566986, |
|
"learning_rate": 8.666666666666666e-06, |
|
"loss": 1.4177, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 627.4509803921569, |
|
"grad_norm": 0.023904943838715553, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 1.4184, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 627.4509803921569, |
|
"eval_loss": 1.4349356889724731, |
|
"eval_runtime": 6.7417, |
|
"eval_samples_per_second": 26.7, |
|
"eval_steps_per_second": 3.412, |
|
"step": 8000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 8000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 667, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7404667537759085e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|