{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 760,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013157894736842105,
      "grad_norm": 251.0,
      "learning_rate": 2.631578947368421e-06,
      "loss": 35.372,
      "step": 1
    },
    {
      "epoch": 0.06578947368421052,
      "grad_norm": 239.0,
      "learning_rate": 1.3157894736842106e-05,
      "loss": 35.3482,
      "step": 5
    },
    {
      "epoch": 0.13157894736842105,
      "grad_norm": 83.5,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 30.5543,
      "step": 10
    },
    {
      "epoch": 0.19736842105263158,
      "grad_norm": 37.5,
      "learning_rate": 3.9473684210526316e-05,
      "loss": 23.0935,
      "step": 15
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 17.875,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 18.9215,
      "step": 20
    },
    {
      "epoch": 0.32894736842105265,
      "grad_norm": 11.125,
      "learning_rate": 6.578947368421054e-05,
      "loss": 16.7379,
      "step": 25
    },
    {
      "epoch": 0.39473684210526316,
      "grad_norm": 3.671875,
      "learning_rate": 7.894736842105263e-05,
      "loss": 14.9012,
      "step": 30
    },
    {
      "epoch": 0.4605263157894737,
      "grad_norm": 3.09375,
      "learning_rate": 9.210526315789474e-05,
      "loss": 14.2412,
      "step": 35
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 4.0,
      "learning_rate": 0.00010526315789473685,
      "loss": 13.5992,
      "step": 40
    },
    {
      "epoch": 0.5921052631578947,
      "grad_norm": 6.15625,
      "learning_rate": 0.00011842105263157894,
      "loss": 12.7389,
      "step": 45
    },
    {
      "epoch": 0.6578947368421053,
      "grad_norm": 11.875,
      "learning_rate": 0.00013157894736842108,
      "loss": 11.4753,
      "step": 50
    },
    {
      "epoch": 0.7236842105263158,
      "grad_norm": 19.25,
      "learning_rate": 0.00014473684210526317,
      "loss": 8.7885,
      "step": 55
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 20.75,
      "learning_rate": 0.00015789473684210527,
      "loss": 4.8341,
      "step": 60
    },
    {
      "epoch": 0.8552631578947368,
      "grad_norm": 5.90625,
      "learning_rate": 0.00017105263157894739,
      "loss": 2.389,
      "step": 65
    },
    {
      "epoch": 0.9210526315789473,
      "grad_norm": 3.953125,
      "learning_rate": 0.00018421052631578948,
      "loss": 1.9682,
      "step": 70
    },
    {
      "epoch": 0.9868421052631579,
      "grad_norm": 1.078125,
      "learning_rate": 0.00019736842105263157,
      "loss": 1.6901,
      "step": 75
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.9966416358947754,
      "eval_runtime": 0.2381,
      "eval_samples_per_second": 41.997,
      "eval_steps_per_second": 4.2,
      "step": 76
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 1.0234375,
      "learning_rate": 0.00019998312416333227,
      "loss": 1.565,
      "step": 80
    },
    {
      "epoch": 1.118421052631579,
      "grad_norm": 0.765625,
      "learning_rate": 0.0001999145758387301,
      "loss": 1.4481,
      "step": 85
    },
    {
      "epoch": 1.1842105263157894,
      "grad_norm": 0.97265625,
      "learning_rate": 0.00019979333640833947,
      "loss": 1.3767,
      "step": 90
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.078125,
      "learning_rate": 0.00019961946980917456,
      "loss": 1.3246,
      "step": 95
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.58203125,
      "learning_rate": 0.00019939306773179497,
      "loss": 1.2875,
      "step": 100
    },
    {
      "epoch": 1.381578947368421,
      "grad_norm": 0.46484375,
      "learning_rate": 0.00019911424957195158,
      "loss": 1.244,
      "step": 105
    },
    {
      "epoch": 1.4473684210526316,
      "grad_norm": 0.859375,
      "learning_rate": 0.00019878316236762196,
      "loss": 1.2335,
      "step": 110
    },
    {
      "epoch": 1.513157894736842,
      "grad_norm": 0.78515625,
      "learning_rate": 0.000198399980721468,
      "loss": 1.2052,
      "step": 115
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.59765625,
      "learning_rate": 0.0001979649067087574,
      "loss": 1.1933,
      "step": 120
    },
    {
      "epoch": 1.6447368421052633,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00019747816977079671,
      "loss": 1.1718,
      "step": 125
    },
    {
      "epoch": 1.7105263157894737,
      "grad_norm": 1.7734375,
      "learning_rate": 0.00019694002659393305,
      "loss": 1.1603,
      "step": 130
    },
    {
      "epoch": 1.776315789473684,
      "grad_norm": 0.5703125,
      "learning_rate": 0.00019635076097418734,
      "loss": 1.1664,
      "step": 135
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 0.765625,
      "learning_rate": 0.00019571068366759143,
      "loss": 1.1523,
      "step": 140
    },
    {
      "epoch": 1.9078947368421053,
      "grad_norm": 0.51953125,
      "learning_rate": 0.00019502013222630712,
      "loss": 1.1372,
      "step": 145
    },
    {
      "epoch": 1.973684210526316,
      "grad_norm": 0.703125,
      "learning_rate": 0.00019427947082061432,
      "loss": 1.1272,
      "step": 150
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.6070284843444824,
      "eval_runtime": 0.2379,
      "eval_samples_per_second": 42.033,
      "eval_steps_per_second": 4.203,
      "step": 152
    },
    {
      "epoch": 2.039473684210526,
      "grad_norm": 0.6875,
      "learning_rate": 0.00019348909004686152,
      "loss": 1.1013,
      "step": 155
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 1.15625,
      "learning_rate": 0.00019264940672148018,
      "loss": 1.1011,
      "step": 160
    },
    {
      "epoch": 2.1710526315789473,
      "grad_norm": 0.59765625,
      "learning_rate": 0.00019176086366117211,
      "loss": 1.0822,
      "step": 165
    },
    {
      "epoch": 2.236842105263158,
      "grad_norm": 1.21875,
      "learning_rate": 0.00019082392944938466,
      "loss": 1.0853,
      "step": 170
    },
    {
      "epoch": 2.3026315789473686,
      "grad_norm": 0.859375,
      "learning_rate": 0.0001898390981891979,
      "loss": 1.0778,
      "step": 175
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 0.58203125,
      "learning_rate": 0.00018880688924275378,
      "loss": 1.0601,
      "step": 180
    },
    {
      "epoch": 2.4342105263157894,
      "grad_norm": 0.8359375,
      "learning_rate": 0.0001877278469573643,
      "loss": 1.0645,
      "step": 185
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.0078125,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.0663,
      "step": 190
    },
    {
      "epoch": 2.5657894736842106,
      "grad_norm": 2.015625,
      "learning_rate": 0.0001854315629494165,
      "loss": 1.0582,
      "step": 195
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 1.015625,
      "learning_rate": 0.00018421553219875658,
      "loss": 1.05,
      "step": 200
    },
    {
      "epoch": 2.6973684210526314,
      "grad_norm": 2.375,
      "learning_rate": 0.00018295508941432815,
      "loss": 1.0463,
      "step": 205
    },
    {
      "epoch": 2.763157894736842,
      "grad_norm": 0.7890625,
      "learning_rate": 0.0001816508993051943,
      "loss": 1.0336,
      "step": 210
    },
    {
      "epoch": 2.8289473684210527,
      "grad_norm": 0.84375,
      "learning_rate": 0.0001803036496510752,
      "loss": 1.0233,
      "step": 215
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 0.76171875,
      "learning_rate": 0.00017891405093963938,
      "loss": 1.0287,
      "step": 220
    },
    {
      "epoch": 2.9605263157894735,
      "grad_norm": 0.75390625,
      "learning_rate": 0.00017748283599182014,
      "loss": 1.0337,
      "step": 225
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.565699338912964,
      "eval_runtime": 0.2367,
      "eval_samples_per_second": 42.254,
      "eval_steps_per_second": 4.225,
      "step": 228
    },
    {
      "epoch": 3.026315789473684,
      "grad_norm": 0.9375,
      "learning_rate": 0.00017601075957535364,
      "loss": 1.0152,
      "step": 230
    },
    {
      "epoch": 3.0921052631578947,
      "grad_norm": 1.109375,
      "learning_rate": 0.00017449859800674371,
      "loss": 0.9987,
      "step": 235
    },
    {
      "epoch": 3.1578947368421053,
      "grad_norm": 2.046875,
      "learning_rate": 0.0001729471487418621,
      "loss": 0.9947,
      "step": 240
    },
    {
      "epoch": 3.223684210526316,
      "grad_norm": 0.94921875,
      "learning_rate": 0.00017135722995540107,
      "loss": 0.9919,
      "step": 245
    },
    {
      "epoch": 3.2894736842105265,
      "grad_norm": 0.99609375,
      "learning_rate": 0.00016972968010939954,
      "loss": 0.9945,
      "step": 250
    },
    {
      "epoch": 3.3552631578947367,
      "grad_norm": 0.890625,
      "learning_rate": 0.00016806535751107037,
      "loss": 0.9904,
      "step": 255
    },
    {
      "epoch": 3.4210526315789473,
      "grad_norm": 1.1171875,
      "learning_rate": 0.00016636513986016213,
      "loss": 0.9886,
      "step": 260
    },
    {
      "epoch": 3.486842105263158,
      "grad_norm": 1.1171875,
      "learning_rate": 0.00016462992378609407,
      "loss": 0.99,
      "step": 265
    },
    {
      "epoch": 3.5526315789473686,
      "grad_norm": 0.78515625,
      "learning_rate": 0.0001628606243751082,
      "loss": 0.9947,
      "step": 270
    },
    {
      "epoch": 3.6184210526315788,
      "grad_norm": 0.76953125,
      "learning_rate": 0.00016105817468768798,
      "loss": 0.991,
      "step": 275
    },
    {
      "epoch": 3.6842105263157894,
      "grad_norm": 1.03125,
      "learning_rate": 0.00015922352526649803,
      "loss": 0.982,
      "step": 280
    },
    {
      "epoch": 3.75,
      "grad_norm": 1.1171875,
      "learning_rate": 0.0001573576436351046,
      "loss": 0.9797,
      "step": 285
    },
    {
      "epoch": 3.8157894736842106,
      "grad_norm": 1.703125,
      "learning_rate": 0.00015546151378774086,
      "loss": 0.9724,
      "step": 290
    },
    {
      "epoch": 3.8815789473684212,
      "grad_norm": 0.6796875,
      "learning_rate": 0.00015353613567038607,
      "loss": 0.9695,
      "step": 295
    },
    {
      "epoch": 3.9473684210526314,
      "grad_norm": 0.87109375,
      "learning_rate": 0.00015158252465343242,
      "loss": 0.9638,
      "step": 300
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.537924289703369,
      "eval_runtime": 0.2365,
      "eval_samples_per_second": 42.288,
      "eval_steps_per_second": 4.229,
      "step": 304
    },
    {
      "epoch": 4.0131578947368425,
      "grad_norm": 0.7734375,
      "learning_rate": 0.00014960171099621795,
      "loss": 0.9669,
      "step": 305
    },
    {
      "epoch": 4.078947368421052,
      "grad_norm": 0.7109375,
      "learning_rate": 0.00014759473930370736,
      "loss": 0.9414,
      "step": 310
    },
    {
      "epoch": 4.144736842105263,
      "grad_norm": 0.921875,
      "learning_rate": 0.00014556266797560732,
      "loss": 0.9589,
      "step": 315
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 0.66796875,
      "learning_rate": 0.00014350656864820733,
      "loss": 0.9489,
      "step": 320
    },
    {
      "epoch": 4.276315789473684,
      "grad_norm": 0.79296875,
      "learning_rate": 0.00014142752562923988,
      "loss": 0.9502,
      "step": 325
    },
    {
      "epoch": 4.342105263157895,
      "grad_norm": 1.2578125,
      "learning_rate": 0.0001393266353260583,
      "loss": 0.9626,
      "step": 330
    },
    {
      "epoch": 4.407894736842105,
      "grad_norm": 0.59765625,
      "learning_rate": 0.00013720500566743362,
      "loss": 0.9511,
      "step": 335
    },
    {
      "epoch": 4.473684210526316,
      "grad_norm": 1.65625,
      "learning_rate": 0.00013506375551927547,
      "loss": 0.9456,
      "step": 340
    },
    {
      "epoch": 4.5394736842105265,
      "grad_norm": 1.4765625,
      "learning_rate": 0.00013290401409458532,
      "loss": 0.9461,
      "step": 345
    },
    {
      "epoch": 4.605263157894737,
      "grad_norm": 1.875,
      "learning_rate": 0.00013072692035795305,
      "loss": 0.9418,
      "step": 350
    },
    {
      "epoch": 4.671052631578947,
      "grad_norm": 0.98828125,
      "learning_rate": 0.00012853362242491053,
      "loss": 0.9481,
      "step": 355
    },
    {
      "epoch": 4.7368421052631575,
      "grad_norm": 0.87890625,
      "learning_rate": 0.00012632527695645993,
      "loss": 0.9461,
      "step": 360
    },
    {
      "epoch": 4.802631578947368,
      "grad_norm": 1.140625,
      "learning_rate": 0.00012410304854909495,
      "loss": 0.9493,
      "step": 365
    },
    {
      "epoch": 4.868421052631579,
      "grad_norm": 0.87890625,
      "learning_rate": 0.0001218681091206376,
      "loss": 0.9283,
      "step": 370
    },
    {
      "epoch": 4.934210526315789,
      "grad_norm": 0.62109375,
      "learning_rate": 0.0001196216372922136,
      "loss": 0.9395,
      "step": 375
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.6484375,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.9419,
      "step": 380
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.5376482009887695,
      "eval_runtime": 0.2349,
      "eval_samples_per_second": 42.567,
      "eval_steps_per_second": 4.257,
      "step": 380
    },
    {
      "epoch": 5.065789473684211,
      "grad_norm": 0.95703125,
      "learning_rate": 0.00011509884070392369,
      "loss": 0.9265,
      "step": 385
    },
    {
      "epoch": 5.131578947368421,
      "grad_norm": 0.60546875,
      "learning_rate": 0.00011282490109308633,
      "loss": 0.9248,
      "step": 390
    },
    {
      "epoch": 5.197368421052632,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00011054419812250338,
      "loss": 0.915,
      "step": 395
    },
    {
      "epoch": 5.2631578947368425,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00010825793454723325,
      "loss": 0.9219,
      "step": 400
    },
    {
      "epoch": 5.328947368421053,
      "grad_norm": 0.73046875,
      "learning_rate": 0.0001059673160547834,
      "loss": 0.9223,
      "step": 405
    },
    {
      "epoch": 5.394736842105263,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00010367355062927726,
      "loss": 0.9236,
      "step": 410
    },
    {
      "epoch": 5.4605263157894735,
      "grad_norm": 0.6640625,
      "learning_rate": 0.00010137784791440965,
      "loss": 0.9252,
      "step": 415
    },
    {
      "epoch": 5.526315789473684,
      "grad_norm": 0.7890625,
      "learning_rate": 9.908141857552737e-05,
      "loss": 0.917,
      "step": 420
    },
    {
      "epoch": 5.592105263157895,
      "grad_norm": 0.671875,
      "learning_rate": 9.678547366117083e-05,
      "loss": 0.9213,
      "step": 425
    },
    {
      "epoch": 5.657894736842105,
      "grad_norm": 0.84375,
      "learning_rate": 9.449122396441345e-05,
      "loss": 0.9098,
      "step": 430
    },
    {
      "epoch": 5.723684210526316,
      "grad_norm": 0.8203125,
      "learning_rate": 9.219987938433621e-05,
      "loss": 0.9233,
      "step": 435
    },
    {
      "epoch": 5.7894736842105265,
      "grad_norm": 0.77734375,
      "learning_rate": 8.991264828797319e-05,
      "loss": 0.9113,
      "step": 440
    },
    {
      "epoch": 5.855263157894737,
      "grad_norm": 0.55859375,
      "learning_rate": 8.763073687306524e-05,
      "loss": 0.9198,
      "step": 445
    },
    {
      "epoch": 5.921052631578947,
      "grad_norm": 1.1328125,
      "learning_rate": 8.535534853195786e-05,
      "loss": 0.9186,
      "step": 450
    },
    {
      "epoch": 5.9868421052631575,
      "grad_norm": 0.6796875,
      "learning_rate": 8.308768321697815e-05,
      "loss": 0.9117,
      "step": 455
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.5333404541015625,
      "eval_runtime": 0.2353,
      "eval_samples_per_second": 42.503,
      "eval_steps_per_second": 4.25,
      "step": 456
    },
    {
      "epoch": 6.052631578947368,
      "grad_norm": 0.53515625,
      "learning_rate": 8.082893680762619e-05,
      "loss": 0.9067,
      "step": 460
    },
    {
      "epoch": 6.118421052631579,
      "grad_norm": 0.73828125,
      "learning_rate": 7.858030047991411e-05,
      "loss": 0.9008,
      "step": 465
    },
    {
      "epoch": 6.184210526315789,
      "grad_norm": 0.5546875,
      "learning_rate": 7.634296007818576e-05,
      "loss": 0.8891,
      "step": 470
    },
    {
      "epoch": 6.25,
      "grad_norm": 0.59375,
      "learning_rate": 7.411809548974792e-05,
      "loss": 0.9044,
      "step": 475
    },
    {
      "epoch": 6.315789473684211,
      "grad_norm": 0.53515625,
      "learning_rate": 7.190688002264308e-05,
      "loss": 0.8954,
      "step": 480
    },
    {
      "epoch": 6.381578947368421,
      "grad_norm": 0.55859375,
      "learning_rate": 6.971047978689189e-05,
      "loss": 0.895,
      "step": 485
    },
    {
      "epoch": 6.447368421052632,
      "grad_norm": 0.54296875,
      "learning_rate": 6.753005307953167e-05,
      "loss": 0.9084,
      "step": 490
    },
    {
      "epoch": 6.5131578947368425,
      "grad_norm": 0.55859375,
      "learning_rate": 6.536674977377496e-05,
      "loss": 0.8972,
      "step": 495
    },
    {
      "epoch": 6.578947368421053,
      "grad_norm": 0.66796875,
      "learning_rate": 6.322171071261071e-05,
      "loss": 0.8986,
      "step": 500
    },
    {
      "epoch": 6.644736842105263,
      "grad_norm": 0.5234375,
      "learning_rate": 6.109606710716741e-05,
      "loss": 0.8889,
      "step": 505
    },
    {
      "epoch": 6.7105263157894735,
      "grad_norm": 0.859375,
      "learning_rate": 5.8990939940156e-05,
      "loss": 0.9018,
      "step": 510
    },
    {
      "epoch": 6.776315789473684,
      "grad_norm": 0.53515625,
      "learning_rate": 5.690743937470657e-05,
      "loss": 0.9085,
      "step": 515
    },
    {
      "epoch": 6.842105263157895,
      "grad_norm": 0.640625,
      "learning_rate": 5.484666416891109e-05,
      "loss": 0.9082,
      "step": 520
    },
    {
      "epoch": 6.907894736842105,
      "grad_norm": 0.54296875,
      "learning_rate": 5.280970109638047e-05,
      "loss": 0.8921,
      "step": 525
    },
    {
      "epoch": 6.973684210526316,
      "grad_norm": 0.58203125,
      "learning_rate": 5.079762437312219e-05,
      "loss": 0.8944,
      "step": 530
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.5417115688323975,
      "eval_runtime": 0.2355,
      "eval_samples_per_second": 42.467,
      "eval_steps_per_second": 4.247,
      "step": 532
    },
    {
      "epoch": 7.0394736842105265,
      "grad_norm": 0.5078125,
      "learning_rate": 4.8811495091039926e-05,
      "loss": 0.8912,
      "step": 535
    },
    {
      "epoch": 7.105263157894737,
      "grad_norm": 0.58984375,
      "learning_rate": 4.685236065835443e-05,
      "loss": 0.8893,
      "step": 540
    },
    {
      "epoch": 7.171052631578948,
      "grad_norm": 0.5078125,
      "learning_rate": 4.492125424724086e-05,
      "loss": 0.8881,
      "step": 545
    },
    {
      "epoch": 7.2368421052631575,
      "grad_norm": 0.5234375,
      "learning_rate": 4.301919424897338e-05,
      "loss": 0.8913,
      "step": 550
    },
    {
      "epoch": 7.302631578947368,
      "grad_norm": 0.498046875,
      "learning_rate": 4.114718373686481e-05,
      "loss": 0.8871,
      "step": 555
    },
    {
      "epoch": 7.368421052631579,
      "grad_norm": 0.55859375,
      "learning_rate": 3.9306209937284346e-05,
      "loss": 0.8854,
      "step": 560
    },
    {
      "epoch": 7.434210526315789,
      "grad_norm": 0.6328125,
      "learning_rate": 3.749724370903216e-05,
      "loss": 0.8908,
      "step": 565
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.51171875,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 0.8887,
      "step": 570
    },
    {
      "epoch": 7.565789473684211,
      "grad_norm": 0.494140625,
      "learning_rate": 3.3979132500809405e-05,
      "loss": 0.8858,
      "step": 575
    },
    {
      "epoch": 7.631578947368421,
      "grad_norm": 0.49609375,
      "learning_rate": 3.227184283742591e-05,
      "loss": 0.8847,
      "step": 580
    },
    {
      "epoch": 7.697368421052632,
      "grad_norm": 0.48046875,
      "learning_rate": 3.0600270400122335e-05,
      "loss": 0.8808,
      "step": 585
    },
    {
      "epoch": 7.7631578947368425,
      "grad_norm": 0.498046875,
      "learning_rate": 2.89652967119336e-05,
      "loss": 0.8825,
      "step": 590
    },
    {
      "epoch": 7.828947368421053,
      "grad_norm": 0.470703125,
      "learning_rate": 2.73677839951215e-05,
      "loss": 0.8923,
      "step": 595
    },
    {
      "epoch": 7.894736842105263,
      "grad_norm": 0.447265625,
      "learning_rate": 2.5808574716471856e-05,
      "loss": 0.8809,
      "step": 600
    },
    {
      "epoch": 7.9605263157894735,
      "grad_norm": 0.490234375,
      "learning_rate": 2.4288491143009795e-05,
      "loss": 0.8824,
      "step": 605
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.547354221343994,
      "eval_runtime": 0.2365,
      "eval_samples_per_second": 42.286,
      "eval_steps_per_second": 4.229,
      "step": 608
    },
    {
      "epoch": 8.026315789473685,
      "grad_norm": 0.478515625,
      "learning_rate": 2.2808334908367914e-05,
      "loss": 0.8926,
      "step": 610
    },
    {
      "epoch": 8.092105263157896,
      "grad_norm": 0.48828125,
      "learning_rate": 2.1368886590035443e-05,
      "loss": 0.8774,
      "step": 615
    },
    {
      "epoch": 8.157894736842104,
      "grad_norm": 0.4921875,
      "learning_rate": 1.9970905297711606e-05,
      "loss": 0.8834,
      "step": 620
    },
    {
      "epoch": 8.223684210526315,
      "grad_norm": 0.53125,
      "learning_rate": 1.861512827298051e-05,
      "loss": 0.8845,
      "step": 625
    },
    {
      "epoch": 8.289473684210526,
      "grad_norm": 0.5859375,
      "learning_rate": 1.7302270500518182e-05,
      "loss": 0.8801,
      "step": 630
    },
    {
      "epoch": 8.355263157894736,
      "grad_norm": 0.49609375,
      "learning_rate": 1.6033024331037138e-05,
      "loss": 0.873,
      "step": 635
    },
    {
      "epoch": 8.421052631578947,
      "grad_norm": 0.5078125,
      "learning_rate": 1.4808059116167305e-05,
      "loss": 0.8795,
      "step": 640
    },
    {
      "epoch": 8.486842105263158,
      "grad_norm": 0.466796875,
      "learning_rate": 1.3628020855465572e-05,
      "loss": 0.8807,
      "step": 645
    },
    {
      "epoch": 8.552631578947368,
      "grad_norm": 0.46484375,
      "learning_rate": 1.2493531855740625e-05,
      "loss": 0.8821,
      "step": 650
    },
    {
      "epoch": 8.618421052631579,
      "grad_norm": 0.47265625,
      "learning_rate": 1.1405190402872202e-05,
      "loss": 0.8731,
      "step": 655
    },
    {
      "epoch": 8.68421052631579,
      "grad_norm": 0.466796875,
      "learning_rate": 1.0363570446297999e-05,
      "loss": 0.8878,
      "step": 660
    },
    {
      "epoch": 8.75,
      "grad_norm": 0.53125,
      "learning_rate": 9.369221296335006e-06,
      "loss": 0.8827,
      "step": 665
    },
    {
      "epoch": 8.81578947368421,
      "grad_norm": 0.46484375,
      "learning_rate": 8.422667334494249e-06,
      "loss": 0.8847,
      "step": 670
    },
    {
      "epoch": 8.881578947368421,
      "grad_norm": 0.48046875,
      "learning_rate": 7.524407736942174e-06,
      "loss": 0.8844,
      "step": 675
    },
    {
      "epoch": 8.947368421052632,
      "grad_norm": 0.53125,
      "learning_rate": 6.674916211254289e-06,
      "loss": 0.8759,
      "step": 680
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.5541369915008545,
      "eval_runtime": 0.2345,
      "eval_samples_per_second": 42.638,
      "eval_steps_per_second": 4.264,
      "step": 684
    },
    {
      "epoch": 9.013157894736842,
      "grad_norm": 0.455078125,
      "learning_rate": 5.8746407466000464e-06,
      "loss": 0.8768,
      "step": 685
    },
    {
      "epoch": 9.078947368421053,
      "grad_norm": 0.4765625,
      "learning_rate": 5.124003377490582e-06,
      "loss": 0.8787,
      "step": 690
    },
    {
      "epoch": 9.144736842105264,
      "grad_norm": 0.515625,
      "learning_rate": 4.423399961213892e-06,
      "loss": 0.8874,
      "step": 695
    },
    {
      "epoch": 9.210526315789474,
      "grad_norm": 0.462890625,
      "learning_rate": 3.7731999690749585e-06,
      "loss": 0.875,
      "step": 700
    },
    {
      "epoch": 9.276315789473685,
      "grad_norm": 0.46875,
      "learning_rate": 3.1737462915508277e-06,
      "loss": 0.8777,
      "step": 705
    },
    {
      "epoch": 9.342105263157896,
      "grad_norm": 0.46875,
      "learning_rate": 2.6253550574632303e-06,
      "loss": 0.8728,
      "step": 710
    },
    {
      "epoch": 9.407894736842104,
      "grad_norm": 0.470703125,
      "learning_rate": 2.128315467264552e-06,
      "loss": 0.8791,
      "step": 715
    },
    {
      "epoch": 9.473684210526315,
      "grad_norm": 0.4765625,
      "learning_rate": 1.6828896405244988e-06,
      "loss": 0.8809,
      "step": 720
    },
    {
      "epoch": 9.539473684210526,
      "grad_norm": 0.458984375,
      "learning_rate": 1.28931247769839e-06,
      "loss": 0.8836,
      "step": 725
    },
    {
      "epoch": 9.605263157894736,
      "grad_norm": 0.455078125,
      "learning_rate": 9.477915362496758e-07,
      "loss": 0.8819,
      "step": 730
    },
    {
      "epoch": 9.671052631578947,
      "grad_norm": 0.4609375,
      "learning_rate": 6.585069211921035e-07,
      "loss": 0.874,
      "step": 735
    },
    {
      "epoch": 9.736842105263158,
      "grad_norm": 0.458984375,
      "learning_rate": 4.216111901092501e-07,
      "loss": 0.8871,
      "step": 740
    },
    {
      "epoch": 9.802631578947368,
      "grad_norm": 0.490234375,
      "learning_rate": 2.372292727015557e-07,
      "loss": 0.8811,
      "step": 745
    },
    {
      "epoch": 9.868421052631579,
      "grad_norm": 0.45703125,
      "learning_rate": 1.0545840490313596e-07,
      "loss": 0.8768,
      "step": 750
    },
    {
      "epoch": 9.93421052631579,
      "grad_norm": 0.46875,
      "learning_rate": 2.6368077603367015e-08,
      "loss": 0.8774,
      "step": 755
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.47265625,
      "learning_rate": 0.0,
      "loss": 0.8735,
      "step": 760
    },
    {
      "epoch": 10.0,
      "eval_loss": 2.5523786544799805,
      "eval_runtime": 0.2353,
      "eval_samples_per_second": 42.494,
      "eval_steps_per_second": 4.249,
      "step": 760
    },
    {
      "epoch": 10.0,
      "step": 760,
      "total_flos": 2.318334860390826e+18,
      "train_loss": 2.2621336485210217,
      "train_runtime": 1856.2046,
      "train_samples_per_second": 26.156,
      "train_steps_per_second": 0.409
    }
  ],
  "logging_steps": 5,
  "max_steps": 760,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "total_flos": 2.318334860390826e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}