|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 1312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007621951219512195,
      "grad_norm": 4.6875,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 1.6918,
      "step": 10
    },
    {
      "epoch": 0.01524390243902439,
      "grad_norm": 5.0625,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 1.6227,
      "step": 20
    },
    {
      "epoch": 0.022865853658536585,
      "grad_norm": 4.34375,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.6367,
      "step": 30
    },
    {
      "epoch": 0.03048780487804878,
      "grad_norm": 4.21875,
      "learning_rate": 6.060606060606061e-06,
      "loss": 1.493,
      "step": 40
    },
    {
      "epoch": 0.038109756097560975,
      "grad_norm": 2.921875,
      "learning_rate": 7.5757575757575764e-06,
      "loss": 1.441,
      "step": 50
    },
    {
      "epoch": 0.04573170731707317,
      "grad_norm": 3.078125,
      "learning_rate": 9.090909090909091e-06,
      "loss": 1.3488,
      "step": 60
    },
    {
      "epoch": 0.053353658536585365,
      "grad_norm": 2.828125,
      "learning_rate": 1.0606060606060606e-05,
      "loss": 1.3289,
      "step": 70
    },
    {
      "epoch": 0.06097560975609756,
      "grad_norm": 2.765625,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 1.2742,
      "step": 80
    },
    {
      "epoch": 0.06859756097560976,
      "grad_norm": 2.890625,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 1.2758,
      "step": 90
    },
    {
      "epoch": 0.07621951219512195,
      "grad_norm": 2.859375,
      "learning_rate": 1.5151515151515153e-05,
      "loss": 1.2707,
      "step": 100
    },
    {
      "epoch": 0.08384146341463415,
      "grad_norm": 2.890625,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.273,
      "step": 110
    },
    {
      "epoch": 0.09146341463414634,
      "grad_norm": 2.765625,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 1.2414,
      "step": 120
    },
    {
      "epoch": 0.09908536585365854,
      "grad_norm": 2.75,
      "learning_rate": 1.96969696969697e-05,
      "loss": 1.275,
      "step": 130
    },
    {
      "epoch": 0.10670731707317073,
      "grad_norm": 2.78125,
      "learning_rate": 1.986440677966102e-05,
      "loss": 1.2547,
      "step": 140
    },
    {
      "epoch": 0.11432926829268293,
      "grad_norm": 2.578125,
      "learning_rate": 1.969491525423729e-05,
      "loss": 1.2391,
      "step": 150
    },
    {
      "epoch": 0.12195121951219512,
      "grad_norm": 2.75,
      "learning_rate": 1.9525423728813562e-05,
      "loss": 1.2379,
      "step": 160
    },
    {
      "epoch": 0.12957317073170732,
      "grad_norm": 2.65625,
      "learning_rate": 1.9355932203389832e-05,
      "loss": 1.2105,
      "step": 170
    },
    {
      "epoch": 0.13719512195121952,
      "grad_norm": 2.53125,
      "learning_rate": 1.9186440677966102e-05,
      "loss": 1.2359,
      "step": 180
    },
    {
      "epoch": 0.1448170731707317,
      "grad_norm": 2.75,
      "learning_rate": 1.9016949152542375e-05,
      "loss": 1.2867,
      "step": 190
    },
    {
      "epoch": 0.1524390243902439,
      "grad_norm": 2.8125,
      "learning_rate": 1.8847457627118645e-05,
      "loss": 1.1762,
      "step": 200
    },
    {
      "epoch": 0.1600609756097561,
      "grad_norm": 2.765625,
      "learning_rate": 1.8677966101694918e-05,
      "loss": 1.1992,
      "step": 210
    },
    {
      "epoch": 0.1676829268292683,
      "grad_norm": 2.75,
      "learning_rate": 1.8508474576271188e-05,
      "loss": 1.232,
      "step": 220
    },
    {
      "epoch": 0.17530487804878048,
      "grad_norm": 2.65625,
      "learning_rate": 1.8338983050847458e-05,
      "loss": 1.2172,
      "step": 230
    },
    {
      "epoch": 0.18292682926829268,
      "grad_norm": 2.734375,
      "learning_rate": 1.816949152542373e-05,
      "loss": 1.182,
      "step": 240
    },
    {
      "epoch": 0.19054878048780488,
      "grad_norm": 2.875,
      "learning_rate": 1.8e-05,
      "loss": 1.2492,
      "step": 250
    },
    {
      "epoch": 0.19817073170731708,
      "grad_norm": 2.71875,
      "learning_rate": 1.7830508474576274e-05,
      "loss": 1.2312,
      "step": 260
    },
    {
      "epoch": 0.20579268292682926,
      "grad_norm": 2.65625,
      "learning_rate": 1.7661016949152543e-05,
      "loss": 1.241,
      "step": 270
    },
    {
      "epoch": 0.21341463414634146,
      "grad_norm": 2.78125,
      "learning_rate": 1.7491525423728813e-05,
      "loss": 1.1879,
      "step": 280
    },
    {
      "epoch": 0.22103658536585366,
      "grad_norm": 2.734375,
      "learning_rate": 1.7322033898305086e-05,
      "loss": 1.2211,
      "step": 290
    },
    {
      "epoch": 0.22865853658536586,
      "grad_norm": 2.734375,
      "learning_rate": 1.715254237288136e-05,
      "loss": 1.2152,
      "step": 300
    },
    {
      "epoch": 0.23628048780487804,
      "grad_norm": 2.8125,
      "learning_rate": 1.698305084745763e-05,
      "loss": 1.1938,
      "step": 310
    },
    {
      "epoch": 0.24390243902439024,
      "grad_norm": 2.71875,
      "learning_rate": 1.68135593220339e-05,
      "loss": 1.2266,
      "step": 320
    },
    {
      "epoch": 0.25152439024390244,
      "grad_norm": 2.84375,
      "learning_rate": 1.6644067796610172e-05,
      "loss": 1.1672,
      "step": 330
    },
    {
      "epoch": 0.25914634146341464,
      "grad_norm": 2.71875,
      "learning_rate": 1.6474576271186442e-05,
      "loss": 1.2104,
      "step": 340
    },
    {
      "epoch": 0.26676829268292684,
      "grad_norm": 2.640625,
      "learning_rate": 1.6305084745762715e-05,
      "loss": 1.2098,
      "step": 350
    },
    {
      "epoch": 0.27439024390243905,
      "grad_norm": 2.703125,
      "learning_rate": 1.6135593220338985e-05,
      "loss": 1.1684,
      "step": 360
    },
    {
      "epoch": 0.2820121951219512,
      "grad_norm": 2.765625,
      "learning_rate": 1.5966101694915255e-05,
      "loss": 1.1734,
      "step": 370
    },
    {
      "epoch": 0.2896341463414634,
      "grad_norm": 2.546875,
      "learning_rate": 1.5796610169491528e-05,
      "loss": 1.2008,
      "step": 380
    },
    {
      "epoch": 0.2972560975609756,
      "grad_norm": 2.78125,
      "learning_rate": 1.5627118644067798e-05,
      "loss": 1.1848,
      "step": 390
    },
    {
      "epoch": 0.3048780487804878,
      "grad_norm": 2.625,
      "learning_rate": 1.545762711864407e-05,
      "loss": 1.2033,
      "step": 400
    },
    {
      "epoch": 0.3125,
      "grad_norm": 2.625,
      "learning_rate": 1.528813559322034e-05,
      "loss": 1.1781,
      "step": 410
    },
    {
      "epoch": 0.3201219512195122,
      "grad_norm": 2.96875,
      "learning_rate": 1.511864406779661e-05,
      "loss": 1.2043,
      "step": 420
    },
    {
      "epoch": 0.3277439024390244,
      "grad_norm": 2.453125,
      "learning_rate": 1.4949152542372882e-05,
      "loss": 1.1699,
      "step": 430
    },
    {
      "epoch": 0.3353658536585366,
      "grad_norm": 2.5625,
      "learning_rate": 1.4779661016949153e-05,
      "loss": 1.1871,
      "step": 440
    },
    {
      "epoch": 0.3429878048780488,
      "grad_norm": 2.578125,
      "learning_rate": 1.4610169491525426e-05,
      "loss": 1.1418,
      "step": 450
    },
    {
      "epoch": 0.35060975609756095,
      "grad_norm": 2.640625,
      "learning_rate": 1.4440677966101698e-05,
      "loss": 1.1824,
      "step": 460
    },
    {
      "epoch": 0.35823170731707316,
      "grad_norm": 2.828125,
      "learning_rate": 1.4271186440677966e-05,
      "loss": 1.176,
      "step": 470
    },
    {
      "epoch": 0.36585365853658536,
      "grad_norm": 2.96875,
      "learning_rate": 1.4101694915254239e-05,
      "loss": 1.1785,
      "step": 480
    },
    {
      "epoch": 0.37347560975609756,
      "grad_norm": 2.796875,
      "learning_rate": 1.393220338983051e-05,
      "loss": 1.1758,
      "step": 490
    },
    {
      "epoch": 0.38109756097560976,
      "grad_norm": 2.75,
      "learning_rate": 1.3762711864406782e-05,
      "loss": 1.1988,
      "step": 500
    },
    {
      "epoch": 0.38871951219512196,
      "grad_norm": 2.578125,
      "learning_rate": 1.3593220338983053e-05,
      "loss": 1.1822,
      "step": 510
    },
    {
      "epoch": 0.39634146341463417,
      "grad_norm": 2.953125,
      "learning_rate": 1.3423728813559323e-05,
      "loss": 1.1801,
      "step": 520
    },
    {
      "epoch": 0.40396341463414637,
      "grad_norm": 2.640625,
      "learning_rate": 1.3254237288135595e-05,
      "loss": 1.2043,
      "step": 530
    },
    {
      "epoch": 0.4115853658536585,
      "grad_norm": 2.546875,
      "learning_rate": 1.3084745762711866e-05,
      "loss": 1.2098,
      "step": 540
    },
    {
      "epoch": 0.4192073170731707,
      "grad_norm": 2.625,
      "learning_rate": 1.2915254237288137e-05,
      "loss": 1.1973,
      "step": 550
    },
    {
      "epoch": 0.4268292682926829,
      "grad_norm": 2.546875,
      "learning_rate": 1.2745762711864407e-05,
      "loss": 1.2066,
      "step": 560
    },
    {
      "epoch": 0.4344512195121951,
      "grad_norm": 2.4375,
      "learning_rate": 1.2576271186440679e-05,
      "loss": 1.1543,
      "step": 570
    },
    {
      "epoch": 0.4420731707317073,
      "grad_norm": 2.640625,
      "learning_rate": 1.240677966101695e-05,
      "loss": 1.1703,
      "step": 580
    },
    {
      "epoch": 0.4496951219512195,
      "grad_norm": 3.09375,
      "learning_rate": 1.2237288135593222e-05,
      "loss": 1.1566,
      "step": 590
    },
    {
      "epoch": 0.4573170731707317,
      "grad_norm": 2.578125,
      "learning_rate": 1.2067796610169493e-05,
      "loss": 1.1623,
      "step": 600
    },
    {
      "epoch": 0.4649390243902439,
      "grad_norm": 2.734375,
      "learning_rate": 1.1898305084745763e-05,
      "loss": 1.198,
      "step": 610
    },
    {
      "epoch": 0.4725609756097561,
      "grad_norm": 2.78125,
      "learning_rate": 1.1728813559322034e-05,
      "loss": 1.1971,
      "step": 620
    },
    {
      "epoch": 0.4801829268292683,
      "grad_norm": 2.546875,
      "learning_rate": 1.1559322033898306e-05,
      "loss": 1.159,
      "step": 630
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 2.84375,
      "learning_rate": 1.1389830508474577e-05,
      "loss": 1.184,
      "step": 640
    },
    {
      "epoch": 0.4954268292682927,
      "grad_norm": 2.578125,
      "learning_rate": 1.1220338983050849e-05,
      "loss": 1.1859,
      "step": 650
    },
    {
      "epoch": 0.5030487804878049,
      "grad_norm": 2.609375,
      "learning_rate": 1.1050847457627118e-05,
      "loss": 1.1434,
      "step": 660
    },
    {
      "epoch": 0.510670731707317,
      "grad_norm": 2.671875,
      "learning_rate": 1.088135593220339e-05,
      "loss": 1.1775,
      "step": 670
    },
    {
      "epoch": 0.5182926829268293,
      "grad_norm": 2.515625,
      "learning_rate": 1.0711864406779661e-05,
      "loss": 1.15,
      "step": 680
    },
    {
      "epoch": 0.5259146341463414,
      "grad_norm": 2.609375,
      "learning_rate": 1.0542372881355933e-05,
      "loss": 1.1613,
      "step": 690
    },
    {
      "epoch": 0.5335365853658537,
      "grad_norm": 2.5625,
      "learning_rate": 1.0372881355932204e-05,
      "loss": 1.1602,
      "step": 700
    },
    {
      "epoch": 0.5411585365853658,
      "grad_norm": 2.671875,
      "learning_rate": 1.0203389830508474e-05,
      "loss": 1.1879,
      "step": 710
    },
    {
      "epoch": 0.5487804878048781,
      "grad_norm": 2.625,
      "learning_rate": 1.0033898305084746e-05,
      "loss": 1.1941,
      "step": 720
    },
    {
      "epoch": 0.5564024390243902,
      "grad_norm": 2.8125,
      "learning_rate": 9.864406779661017e-06,
      "loss": 1.1574,
      "step": 730
    },
    {
      "epoch": 0.5640243902439024,
      "grad_norm": 2.75,
      "learning_rate": 9.69491525423729e-06,
      "loss": 1.1793,
      "step": 740
    },
    {
      "epoch": 0.5716463414634146,
      "grad_norm": 2.703125,
      "learning_rate": 9.52542372881356e-06,
      "loss": 1.1711,
      "step": 750
    },
    {
      "epoch": 0.5792682926829268,
      "grad_norm": 2.703125,
      "learning_rate": 9.355932203389831e-06,
      "loss": 1.1875,
      "step": 760
    },
    {
      "epoch": 0.586890243902439,
      "grad_norm": 2.8125,
      "learning_rate": 9.186440677966101e-06,
      "loss": 1.19,
      "step": 770
    },
    {
      "epoch": 0.5945121951219512,
      "grad_norm": 2.9375,
      "learning_rate": 9.016949152542374e-06,
      "loss": 1.141,
      "step": 780
    },
    {
      "epoch": 0.6021341463414634,
      "grad_norm": 2.75,
      "learning_rate": 8.847457627118646e-06,
      "loss": 1.1699,
      "step": 790
    },
    {
      "epoch": 0.6097560975609756,
      "grad_norm": 2.65625,
      "learning_rate": 8.677966101694915e-06,
      "loss": 1.1809,
      "step": 800
    },
    {
      "epoch": 0.6173780487804879,
      "grad_norm": 2.703125,
      "learning_rate": 8.508474576271187e-06,
      "loss": 1.1824,
      "step": 810
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.796875,
      "learning_rate": 8.338983050847458e-06,
      "loss": 1.1729,
      "step": 820
    },
    {
      "epoch": 0.6326219512195121,
      "grad_norm": 2.765625,
      "learning_rate": 8.16949152542373e-06,
      "loss": 1.1113,
      "step": 830
    },
    {
      "epoch": 0.6402439024390244,
      "grad_norm": 2.5,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.1434,
      "step": 840
    },
    {
      "epoch": 0.6478658536585366,
      "grad_norm": 2.515625,
      "learning_rate": 7.830508474576271e-06,
      "loss": 1.1643,
      "step": 850
    },
    {
      "epoch": 0.6554878048780488,
      "grad_norm": 2.65625,
      "learning_rate": 7.661016949152543e-06,
      "loss": 1.1738,
      "step": 860
    },
    {
      "epoch": 0.663109756097561,
      "grad_norm": 2.609375,
      "learning_rate": 7.491525423728814e-06,
      "loss": 1.125,
      "step": 870
    },
    {
      "epoch": 0.6707317073170732,
      "grad_norm": 2.5625,
      "learning_rate": 7.3220338983050855e-06,
      "loss": 1.1723,
      "step": 880
    },
    {
      "epoch": 0.6783536585365854,
      "grad_norm": 2.546875,
      "learning_rate": 7.152542372881357e-06,
      "loss": 1.1617,
      "step": 890
    },
    {
      "epoch": 0.6859756097560976,
      "grad_norm": 2.6875,
      "learning_rate": 6.9830508474576275e-06,
      "loss": 1.1445,
      "step": 900
    },
    {
      "epoch": 0.6935975609756098,
      "grad_norm": 2.546875,
      "learning_rate": 6.813559322033899e-06,
      "loss": 1.1807,
      "step": 910
    },
    {
      "epoch": 0.7012195121951219,
      "grad_norm": 2.65625,
      "learning_rate": 6.64406779661017e-06,
      "loss": 1.1865,
      "step": 920
    },
    {
      "epoch": 0.7088414634146342,
      "grad_norm": 2.734375,
      "learning_rate": 6.474576271186441e-06,
      "loss": 1.198,
      "step": 930
    },
    {
      "epoch": 0.7164634146341463,
      "grad_norm": 2.765625,
      "learning_rate": 6.3050847457627125e-06,
      "loss": 1.1734,
      "step": 940
    },
    {
      "epoch": 0.7240853658536586,
      "grad_norm": 2.609375,
      "learning_rate": 6.135593220338983e-06,
      "loss": 1.1633,
      "step": 950
    },
    {
      "epoch": 0.7317073170731707,
      "grad_norm": 2.578125,
      "learning_rate": 5.9661016949152555e-06,
      "loss": 1.1953,
      "step": 960
    },
    {
      "epoch": 0.739329268292683,
      "grad_norm": 2.5,
      "learning_rate": 5.796610169491525e-06,
      "loss": 1.1879,
      "step": 970
    },
    {
      "epoch": 0.7469512195121951,
      "grad_norm": 2.609375,
      "learning_rate": 5.6271186440677975e-06,
      "loss": 1.1664,
      "step": 980
    },
    {
      "epoch": 0.7545731707317073,
      "grad_norm": 2.765625,
      "learning_rate": 5.457627118644067e-06,
      "loss": 1.1789,
      "step": 990
    },
    {
      "epoch": 0.7621951219512195,
      "grad_norm": 2.890625,
      "learning_rate": 5.28813559322034e-06,
      "loss": 1.1738,
      "step": 1000
    },
    {
      "epoch": 0.7698170731707317,
      "grad_norm": 2.6875,
      "learning_rate": 5.118644067796611e-06,
      "loss": 1.1582,
      "step": 1010
    },
    {
      "epoch": 0.7774390243902439,
      "grad_norm": 2.9375,
      "learning_rate": 4.949152542372882e-06,
      "loss": 1.1391,
      "step": 1020
    },
    {
      "epoch": 0.7850609756097561,
      "grad_norm": 2.578125,
      "learning_rate": 4.779661016949153e-06,
      "loss": 1.1652,
      "step": 1030
    },
    {
      "epoch": 0.7926829268292683,
      "grad_norm": 2.703125,
      "learning_rate": 4.610169491525424e-06,
      "loss": 1.1426,
      "step": 1040
    },
    {
      "epoch": 0.8003048780487805,
      "grad_norm": 2.71875,
      "learning_rate": 4.440677966101695e-06,
      "loss": 1.1713,
      "step": 1050
    },
    {
      "epoch": 0.8079268292682927,
      "grad_norm": 2.515625,
      "learning_rate": 4.271186440677967e-06,
      "loss": 1.1727,
      "step": 1060
    },
    {
      "epoch": 0.8155487804878049,
      "grad_norm": 2.59375,
      "learning_rate": 4.101694915254237e-06,
      "loss": 1.1695,
      "step": 1070
    },
    {
      "epoch": 0.823170731707317,
      "grad_norm": 2.671875,
      "learning_rate": 3.932203389830509e-06,
      "loss": 1.1672,
      "step": 1080
    },
    {
      "epoch": 0.8307926829268293,
      "grad_norm": 2.734375,
      "learning_rate": 3.76271186440678e-06,
      "loss": 1.2016,
      "step": 1090
    },
    {
      "epoch": 0.8384146341463414,
      "grad_norm": 2.90625,
      "learning_rate": 3.5932203389830512e-06,
      "loss": 1.1727,
      "step": 1100
    },
    {
      "epoch": 0.8460365853658537,
      "grad_norm": 2.84375,
      "learning_rate": 3.4237288135593223e-06,
      "loss": 1.1799,
      "step": 1110
    },
    {
      "epoch": 0.8536585365853658,
      "grad_norm": 2.8125,
      "learning_rate": 3.2542372881355933e-06,
      "loss": 1.1555,
      "step": 1120
    },
    {
      "epoch": 0.8612804878048781,
      "grad_norm": 2.6875,
      "learning_rate": 3.0847457627118648e-06,
      "loss": 1.1678,
      "step": 1130
    },
    {
      "epoch": 0.8689024390243902,
      "grad_norm": 2.59375,
      "learning_rate": 2.915254237288136e-06,
      "loss": 1.1578,
      "step": 1140
    },
    {
      "epoch": 0.8765243902439024,
      "grad_norm": 2.375,
      "learning_rate": 2.745762711864407e-06,
      "loss": 1.1535,
      "step": 1150
    },
    {
      "epoch": 0.8841463414634146,
      "grad_norm": 2.703125,
      "learning_rate": 2.576271186440678e-06,
      "loss": 1.1602,
      "step": 1160
    },
    {
      "epoch": 0.8917682926829268,
      "grad_norm": 2.78125,
      "learning_rate": 2.4067796610169493e-06,
      "loss": 1.1852,
      "step": 1170
    },
    {
      "epoch": 0.899390243902439,
      "grad_norm": 2.59375,
      "learning_rate": 2.2372881355932204e-06,
      "loss": 1.1605,
      "step": 1180
    },
    {
      "epoch": 0.9070121951219512,
      "grad_norm": 2.59375,
      "learning_rate": 2.0677966101694914e-06,
      "loss": 1.1451,
      "step": 1190
    },
    {
      "epoch": 0.9146341463414634,
      "grad_norm": 2.5625,
      "learning_rate": 1.8983050847457629e-06,
      "loss": 1.1496,
      "step": 1200
    },
    {
      "epoch": 0.9222560975609756,
      "grad_norm": 2.859375,
      "learning_rate": 1.728813559322034e-06,
      "loss": 1.1582,
      "step": 1210
    },
    {
      "epoch": 0.9298780487804879,
      "grad_norm": 2.65625,
      "learning_rate": 1.5593220338983054e-06,
      "loss": 1.227,
      "step": 1220
    },
    {
      "epoch": 0.9375,
      "grad_norm": 2.703125,
      "learning_rate": 1.3898305084745764e-06,
      "loss": 1.1977,
      "step": 1230
    },
    {
      "epoch": 0.9451219512195121,
      "grad_norm": 2.671875,
      "learning_rate": 1.2203389830508477e-06,
      "loss": 1.1885,
      "step": 1240
    },
    {
      "epoch": 0.9527439024390244,
      "grad_norm": 2.6875,
      "learning_rate": 1.0508474576271187e-06,
      "loss": 1.1549,
      "step": 1250
    },
    {
      "epoch": 0.9603658536585366,
      "grad_norm": 2.53125,
      "learning_rate": 8.813559322033899e-07,
      "loss": 1.1781,
      "step": 1260
    },
    {
      "epoch": 0.9679878048780488,
      "grad_norm": 2.5625,
      "learning_rate": 7.118644067796611e-07,
      "loss": 1.1957,
      "step": 1270
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 2.640625,
      "learning_rate": 5.423728813559322e-07,
      "loss": 1.1898,
      "step": 1280
    },
    {
      "epoch": 0.9832317073170732,
      "grad_norm": 2.625,
      "learning_rate": 3.7288135593220347e-07,
      "loss": 1.1531,
      "step": 1290
    },
    {
      "epoch": 0.9908536585365854,
      "grad_norm": 2.6875,
      "learning_rate": 2.0338983050847458e-07,
      "loss": 1.1773,
      "step": 1300
    },
    {
      "epoch": 0.9984756097560976,
      "grad_norm": 2.640625,
      "learning_rate": 3.3898305084745764e-08,
      "loss": 1.0754,
      "step": 1310
    },
    {
      "epoch": 1.0,
      "step": 1312,
      "total_flos": 4.7710598211200614e+17,
      "train_loss": 1.2026635146722562,
      "train_runtime": 2875.3707,
      "train_samples_per_second": 14.591,
      "train_steps_per_second": 0.456
    }
  ],
  "logging_steps": 10,
  "max_steps": 1312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.7710598211200614e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
|
|