{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025,
      "grad_norm": 37.31816101074219,
      "learning_rate": 1.9900000000000003e-05,
      "loss": 10.1104,
      "step": 10
    },
    {
      "epoch": 0.05,
      "grad_norm": 35.192100524902344,
      "learning_rate": 1.98e-05,
      "loss": 7.9082,
      "step": 20
    },
    {
      "epoch": 0.075,
      "grad_norm": 25.770776748657227,
      "learning_rate": 1.97e-05,
      "loss": 7.2319,
      "step": 30
    },
    {
      "epoch": 0.1,
      "grad_norm": 31.321504592895508,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 6.9716,
      "step": 40
    },
    {
      "epoch": 0.125,
      "grad_norm": 34.35246658325195,
      "learning_rate": 1.95e-05,
      "loss": 6.5322,
      "step": 50
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.44606018066406,
      "learning_rate": 1.94e-05,
      "loss": 6.1754,
      "step": 60
    },
    {
      "epoch": 0.175,
      "grad_norm": 38.797672271728516,
      "learning_rate": 1.93e-05,
      "loss": 5.7241,
      "step": 70
    },
    {
      "epoch": 0.2,
      "grad_norm": 28.263782501220703,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 5.305,
      "step": 80
    },
    {
      "epoch": 0.225,
      "grad_norm": 41.63703155517578,
      "learning_rate": 1.91e-05,
      "loss": 4.9392,
      "step": 90
    },
    {
      "epoch": 0.25,
      "grad_norm": 31.407997131347656,
      "learning_rate": 1.9e-05,
      "loss": 4.4984,
      "step": 100
    },
    {
      "epoch": 0.275,
      "grad_norm": 41.87642288208008,
      "learning_rate": 1.8900000000000002e-05,
      "loss": 4.1325,
      "step": 110
    },
    {
      "epoch": 0.3,
      "grad_norm": 35.91765213012695,
      "learning_rate": 1.88e-05,
      "loss": 3.7626,
      "step": 120
    },
    {
      "epoch": 0.325,
      "grad_norm": 25.258136749267578,
      "learning_rate": 1.8700000000000004e-05,
      "loss": 3.4081,
      "step": 130
    },
    {
      "epoch": 0.35,
      "grad_norm": 20.95956802368164,
      "learning_rate": 1.86e-05,
      "loss": 3.0839,
      "step": 140
    },
    {
      "epoch": 0.375,
      "grad_norm": 31.322267532348633,
      "learning_rate": 1.8500000000000002e-05,
      "loss": 2.8921,
      "step": 150
    },
    {
      "epoch": 0.4,
      "grad_norm": 26.40801429748535,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 2.4794,
      "step": 160
    },
    {
      "epoch": 0.425,
      "grad_norm": 14.098700523376465,
      "learning_rate": 1.83e-05,
      "loss": 2.3116,
      "step": 170
    },
    {
      "epoch": 0.45,
      "grad_norm": 13.986673355102539,
      "learning_rate": 1.8200000000000002e-05,
      "loss": 2.3836,
      "step": 180
    },
    {
      "epoch": 0.475,
      "grad_norm": 9.944358825683594,
      "learning_rate": 1.8100000000000003e-05,
      "loss": 1.816,
      "step": 190
    },
    {
      "epoch": 0.5,
      "grad_norm": 24.969406127929688,
      "learning_rate": 1.8e-05,
      "loss": 2.2833,
      "step": 200
    },
    {
      "epoch": 0.525,
      "grad_norm": 8.329447746276855,
      "learning_rate": 1.79e-05,
      "loss": 1.868,
      "step": 210
    },
    {
      "epoch": 0.55,
      "grad_norm": 13.586796760559082,
      "learning_rate": 1.7800000000000002e-05,
      "loss": 1.8955,
      "step": 220
    },
    {
      "epoch": 0.575,
      "grad_norm": 9.99631118774414,
      "learning_rate": 1.77e-05,
      "loss": 1.7817,
      "step": 230
    },
    {
      "epoch": 0.6,
      "grad_norm": 9.203180313110352,
      "learning_rate": 1.76e-05,
      "loss": 1.3639,
      "step": 240
    },
    {
      "epoch": 0.625,
      "grad_norm": 14.064183235168457,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 1.7589,
      "step": 250
    },
    {
      "epoch": 0.65,
      "grad_norm": 3.787022590637207,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 1.5226,
      "step": 260
    },
    {
      "epoch": 0.675,
      "grad_norm": 8.696346282958984,
      "learning_rate": 1.73e-05,
      "loss": 1.6945,
      "step": 270
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.8470327854156494,
      "learning_rate": 1.72e-05,
      "loss": 1.515,
      "step": 280
    },
    {
      "epoch": 0.725,
      "grad_norm": 14.381975173950195,
      "learning_rate": 1.7100000000000002e-05,
      "loss": 1.5426,
      "step": 290
    },
    {
      "epoch": 0.75,
      "grad_norm": 11.33414077758789,
      "learning_rate": 1.7e-05,
      "loss": 1.9131,
      "step": 300
    },
    {
      "epoch": 0.775,
      "grad_norm": 10.369054794311523,
      "learning_rate": 1.69e-05,
      "loss": 1.4884,
      "step": 310
    },
    {
      "epoch": 0.8,
      "grad_norm": 10.790966987609863,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 1.5494,
      "step": 320
    },
    {
      "epoch": 0.825,
      "grad_norm": 1.5759797096252441,
      "learning_rate": 1.67e-05,
      "loss": 1.4403,
      "step": 330
    },
    {
      "epoch": 0.85,
      "grad_norm": 7.573840141296387,
      "learning_rate": 1.66e-05,
      "loss": 1.6335,
      "step": 340
    },
    {
      "epoch": 0.875,
      "grad_norm": 6.928182125091553,
      "learning_rate": 1.65e-05,
      "loss": 1.6648,
      "step": 350
    },
    {
      "epoch": 0.9,
      "grad_norm": 10.622973442077637,
      "learning_rate": 1.64e-05,
      "loss": 1.5502,
      "step": 360
    },
    {
      "epoch": 0.925,
      "grad_norm": 15.8394136428833,
      "learning_rate": 1.63e-05,
      "loss": 1.8488,
      "step": 370
    },
    {
      "epoch": 0.95,
      "grad_norm": 8.823399543762207,
      "learning_rate": 1.62e-05,
      "loss": 1.5358,
      "step": 380
    },
    {
      "epoch": 0.975,
      "grad_norm": 13.792437553405762,
      "learning_rate": 1.6100000000000002e-05,
      "loss": 1.407,
      "step": 390
    },
    {
      "epoch": 1.0,
      "grad_norm": 12.505993843078613,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.57,
      "step": 400
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.6697252988815308,
      "eval_runtime": 177.9001,
      "eval_samples_per_second": 2.248,
      "eval_steps_per_second": 0.281,
      "step": 400
    },
    {
      "epoch": 1.025,
      "grad_norm": 2.272650957107544,
      "learning_rate": 1.5900000000000004e-05,
      "loss": 1.5599,
      "step": 410
    },
    {
      "epoch": 1.05,
      "grad_norm": 8.81256103515625,
      "learning_rate": 1.58e-05,
      "loss": 1.5504,
      "step": 420
    },
    {
      "epoch": 1.075,
      "grad_norm": 9.009818077087402,
      "learning_rate": 1.5700000000000002e-05,
      "loss": 1.4481,
      "step": 430
    },
    {
      "epoch": 1.1,
      "grad_norm": 10.511366844177246,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 1.6972,
      "step": 440
    },
    {
      "epoch": 1.125,
      "grad_norm": 1.3748583793640137,
      "learning_rate": 1.55e-05,
      "loss": 1.4778,
      "step": 450
    },
    {
      "epoch": 1.15,
      "grad_norm": 8.614971160888672,
      "learning_rate": 1.54e-05,
      "loss": 1.9503,
      "step": 460
    },
    {
      "epoch": 1.175,
      "grad_norm": 4.834426403045654,
      "learning_rate": 1.5300000000000003e-05,
      "loss": 1.5342,
      "step": 470
    },
    {
      "epoch": 1.2,
      "grad_norm": 3.6863138675689697,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 1.6797,
      "step": 480
    },
    {
      "epoch": 1.225,
      "grad_norm": 2.9746885299682617,
      "learning_rate": 1.5100000000000001e-05,
      "loss": 1.6074,
      "step": 490
    },
    {
      "epoch": 1.25,
      "grad_norm": 7.708632469177246,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.6568,
      "step": 500
    },
    {
      "epoch": 1.275,
      "grad_norm": 4.480324745178223,
      "learning_rate": 1.4900000000000001e-05,
      "loss": 1.5063,
      "step": 510
    },
    {
      "epoch": 1.3,
      "grad_norm": 5.0911173820495605,
      "learning_rate": 1.48e-05,
      "loss": 1.59,
      "step": 520
    },
    {
      "epoch": 1.325,
      "grad_norm": 1.7006248235702515,
      "learning_rate": 1.4700000000000002e-05,
      "loss": 1.5901,
      "step": 530
    },
    {
      "epoch": 1.35,
      "grad_norm": 4.242706775665283,
      "learning_rate": 1.46e-05,
      "loss": 1.3782,
      "step": 540
    },
    {
      "epoch": 1.375,
      "grad_norm": 7.67242956161499,
      "learning_rate": 1.45e-05,
      "loss": 1.4228,
      "step": 550
    },
    {
      "epoch": 1.4,
      "grad_norm": 6.6367082595825195,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 1.973,
      "step": 560
    },
    {
      "epoch": 1.425,
      "grad_norm": 4.86959981918335,
      "learning_rate": 1.43e-05,
      "loss": 1.7341,
      "step": 570
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.4599714279174805,
      "learning_rate": 1.4200000000000001e-05,
      "loss": 1.9434,
      "step": 580
    },
    {
      "epoch": 1.475,
      "grad_norm": 1.5026886463165283,
      "learning_rate": 1.41e-05,
      "loss": 1.6237,
      "step": 590
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.1954498291015625,
      "learning_rate": 1.4e-05,
      "loss": 1.399,
      "step": 600
    },
    {
      "epoch": 1.525,
      "grad_norm": 6.557270050048828,
      "learning_rate": 1.39e-05,
      "loss": 1.4375,
      "step": 610
    },
    {
      "epoch": 1.55,
      "grad_norm": 7.460885524749756,
      "learning_rate": 1.38e-05,
      "loss": 1.7234,
      "step": 620
    },
    {
      "epoch": 1.575,
      "grad_norm": 15.711455345153809,
      "learning_rate": 1.3700000000000003e-05,
      "loss": 1.4367,
      "step": 630
    },
    {
      "epoch": 1.6,
      "grad_norm": 4.1283392906188965,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 1.1727,
      "step": 640
    },
    {
      "epoch": 1.625,
      "grad_norm": 4.098735332489014,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 1.4125,
      "step": 650
    },
    {
      "epoch": 1.65,
      "grad_norm": 7.408526420593262,
      "learning_rate": 1.3400000000000002e-05,
      "loss": 1.6468,
      "step": 660
    },
    {
      "epoch": 1.675,
      "grad_norm": 5.501112937927246,
      "learning_rate": 1.3300000000000001e-05,
      "loss": 1.8357,
      "step": 670
    },
    {
      "epoch": 1.7,
      "grad_norm": 4.180479049682617,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 1.3682,
      "step": 680
    },
    {
      "epoch": 1.725,
      "grad_norm": 1.872931718826294,
      "learning_rate": 1.3100000000000002e-05,
      "loss": 1.5488,
      "step": 690
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.205283522605896,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.6343,
      "step": 700
    },
    {
      "epoch": 1.775,
      "grad_norm": 2.79819917678833,
      "learning_rate": 1.2900000000000002e-05,
      "loss": 1.6792,
      "step": 710
    },
    {
      "epoch": 1.8,
      "grad_norm": 2.925333023071289,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 1.63,
      "step": 720
    },
    {
      "epoch": 1.825,
      "grad_norm": 11.69040298461914,
      "learning_rate": 1.27e-05,
      "loss": 1.8663,
      "step": 730
    },
    {
      "epoch": 1.85,
      "grad_norm": 3.8195672035217285,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 1.4885,
      "step": 740
    },
    {
      "epoch": 1.875,
      "grad_norm": 6.955050468444824,
      "learning_rate": 1.25e-05,
      "loss": 1.6633,
      "step": 750
    },
    {
      "epoch": 1.9,
      "grad_norm": 3.9344043731689453,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 1.6519,
      "step": 760
    },
    {
      "epoch": 1.925,
      "grad_norm": 3.235872983932495,
      "learning_rate": 1.23e-05,
      "loss": 1.4941,
      "step": 770
    },
    {
      "epoch": 1.95,
      "grad_norm": 6.8774518966674805,
      "learning_rate": 1.22e-05,
      "loss": 1.694,
      "step": 780
    },
    {
      "epoch": 1.975,
      "grad_norm": 1.166123867034912,
      "learning_rate": 1.2100000000000001e-05,
      "loss": 1.336,
      "step": 790
    },
    {
      "epoch": 2.0,
      "grad_norm": 13.162934303283691,
      "learning_rate": 1.2e-05,
      "loss": 1.4602,
      "step": 800
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.6496708393096924,
      "eval_runtime": 179.3668,
      "eval_samples_per_second": 2.23,
      "eval_steps_per_second": 0.279,
      "step": 800
    },
    {
      "epoch": 2.025,
      "grad_norm": 7.529654026031494,
      "learning_rate": 1.1900000000000001e-05,
      "loss": 1.5174,
      "step": 810
    },
    {
      "epoch": 2.05,
      "grad_norm": 14.058820724487305,
      "learning_rate": 1.18e-05,
      "loss": 1.8126,
      "step": 820
    },
    {
      "epoch": 2.075,
      "grad_norm": 11.69058895111084,
      "learning_rate": 1.17e-05,
      "loss": 1.5155,
      "step": 830
    },
    {
      "epoch": 2.1,
      "grad_norm": 9.208683013916016,
      "learning_rate": 1.16e-05,
      "loss": 1.5014,
      "step": 840
    },
    {
      "epoch": 2.125,
      "grad_norm": 10.896733283996582,
      "learning_rate": 1.15e-05,
      "loss": 1.5513,
      "step": 850
    },
    {
      "epoch": 2.15,
      "grad_norm": 1.682322382926941,
      "learning_rate": 1.14e-05,
      "loss": 1.6424,
      "step": 860
    },
    {
      "epoch": 2.175,
      "grad_norm": 8.488654136657715,
      "learning_rate": 1.13e-05,
      "loss": 1.3778,
      "step": 870
    },
    {
      "epoch": 2.2,
      "grad_norm": 7.200578689575195,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 1.5253,
      "step": 880
    },
    {
      "epoch": 2.225,
      "grad_norm": 2.4229137897491455,
      "learning_rate": 1.1100000000000002e-05,
      "loss": 1.6402,
      "step": 890
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.4099198579788208,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.6234,
      "step": 900
    },
    {
      "epoch": 2.275,
      "grad_norm": 1.4259026050567627,
      "learning_rate": 1.0900000000000002e-05,
      "loss": 1.8434,
      "step": 910
    },
    {
      "epoch": 2.3,
      "grad_norm": 2.157871723175049,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 1.4639,
      "step": 920
    },
    {
      "epoch": 2.325,
      "grad_norm": 6.281842231750488,
      "learning_rate": 1.0700000000000001e-05,
      "loss": 1.5842,
      "step": 930
    },
    {
      "epoch": 2.35,
      "grad_norm": 12.432153701782227,
      "learning_rate": 1.0600000000000002e-05,
      "loss": 1.5704,
      "step": 940
    },
    {
      "epoch": 2.375,
      "grad_norm": 1.2600141763687134,
      "learning_rate": 1.0500000000000001e-05,
      "loss": 1.5254,
      "step": 950
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.7496198415756226,
      "learning_rate": 1.04e-05,
      "loss": 1.654,
      "step": 960
    },
    {
      "epoch": 2.425,
      "grad_norm": 3.237362861633301,
      "learning_rate": 1.0300000000000001e-05,
      "loss": 1.3958,
      "step": 970
    },
    {
      "epoch": 2.45,
      "grad_norm": 11.880879402160645,
      "learning_rate": 1.02e-05,
      "loss": 1.8982,
      "step": 980
    },
    {
      "epoch": 2.475,
      "grad_norm": 4.163928985595703,
      "learning_rate": 1.0100000000000002e-05,
      "loss": 1.3325,
      "step": 990
    },
    {
      "epoch": 2.5,
      "grad_norm": 5.584897994995117,
      "learning_rate": 1e-05,
      "loss": 1.7374,
      "step": 1000
    },
    {
      "epoch": 2.525,
      "grad_norm": 1.0842101573944092,
      "learning_rate": 9.9e-06,
      "loss": 1.6294,
      "step": 1010
    },
    {
      "epoch": 2.55,
      "grad_norm": 3.4661624431610107,
      "learning_rate": 9.800000000000001e-06,
      "loss": 1.8073,
      "step": 1020
    },
    {
      "epoch": 2.575,
      "grad_norm": 2.1857829093933105,
      "learning_rate": 9.7e-06,
      "loss": 1.5675,
      "step": 1030
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.365864634513855,
      "learning_rate": 9.600000000000001e-06,
      "loss": 1.7444,
      "step": 1040
    },
    {
      "epoch": 2.625,
      "grad_norm": 6.100329875946045,
      "learning_rate": 9.5e-06,
      "loss": 1.3475,
      "step": 1050
    },
    {
      "epoch": 2.65,
      "grad_norm": 8.876641273498535,
      "learning_rate": 9.4e-06,
      "loss": 1.5366,
      "step": 1060
    },
    {
      "epoch": 2.675,
      "grad_norm": 9.202727317810059,
      "learning_rate": 9.3e-06,
      "loss": 1.7323,
      "step": 1070
    },
    {
      "epoch": 2.7,
      "grad_norm": 8.818689346313477,
      "learning_rate": 9.200000000000002e-06,
      "loss": 1.3941,
      "step": 1080
    },
    {
      "epoch": 2.725,
      "grad_norm": 1.5399378538131714,
      "learning_rate": 9.100000000000001e-06,
      "loss": 1.4933,
      "step": 1090
    },
    {
      "epoch": 2.75,
      "grad_norm": 4.687498569488525,
      "learning_rate": 9e-06,
      "loss": 1.5237,
      "step": 1100
    },
    {
      "epoch": 2.775,
      "grad_norm": 1.0228453874588013,
      "learning_rate": 8.900000000000001e-06,
      "loss": 1.3142,
      "step": 1110
    },
    {
      "epoch": 2.8,
      "grad_norm": 3.121901273727417,
      "learning_rate": 8.8e-06,
      "loss": 1.5802,
      "step": 1120
    },
    {
      "epoch": 2.825,
      "grad_norm": 5.920619487762451,
      "learning_rate": 8.700000000000001e-06,
      "loss": 1.6471,
      "step": 1130
    },
    {
      "epoch": 2.85,
      "grad_norm": 1.708122730255127,
      "learning_rate": 8.6e-06,
      "loss": 1.5874,
      "step": 1140
    },
    {
      "epoch": 2.875,
      "grad_norm": 6.527494430541992,
      "learning_rate": 8.5e-06,
      "loss": 1.5082,
      "step": 1150
    },
    {
      "epoch": 2.9,
      "grad_norm": 5.949902057647705,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.3408,
      "step": 1160
    },
    {
      "epoch": 2.925,
      "grad_norm": 1.2279417514801025,
      "learning_rate": 8.3e-06,
      "loss": 1.6026,
      "step": 1170
    },
    {
      "epoch": 2.95,
      "grad_norm": 5.212494850158691,
      "learning_rate": 8.2e-06,
      "loss": 1.5463,
      "step": 1180
    },
    {
      "epoch": 2.975,
      "grad_norm": 2.9334352016448975,
      "learning_rate": 8.1e-06,
      "loss": 1.7687,
      "step": 1190
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.94165563583374,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.5553,
      "step": 1200
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.6061710119247437,
      "eval_runtime": 179.0332,
      "eval_samples_per_second": 2.234,
      "eval_steps_per_second": 0.279,
      "step": 1200
    },
    {
      "epoch": 3.025,
      "grad_norm": 1.4166760444641113,
      "learning_rate": 7.9e-06,
      "loss": 1.6249,
      "step": 1210
    },
    {
      "epoch": 3.05,
      "grad_norm": 2.30269455909729,
      "learning_rate": 7.800000000000002e-06,
      "loss": 1.3924,
      "step": 1220
    },
    {
      "epoch": 3.075,
      "grad_norm": 3.9721744060516357,
      "learning_rate": 7.7e-06,
      "loss": 1.3955,
      "step": 1230
    },
    {
      "epoch": 3.1,
      "grad_norm": 6.161261558532715,
      "learning_rate": 7.600000000000001e-06,
      "loss": 1.7152,
      "step": 1240
    },
    {
      "epoch": 3.125,
      "grad_norm": 1.144785761833191,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.4025,
      "step": 1250
    },
    {
      "epoch": 3.15,
      "grad_norm": 3.4314322471618652,
      "learning_rate": 7.4e-06,
      "loss": 1.779,
      "step": 1260
    },
    {
      "epoch": 3.175,
      "grad_norm": 9.453422546386719,
      "learning_rate": 7.3e-06,
      "loss": 1.523,
      "step": 1270
    },
    {
      "epoch": 3.2,
      "grad_norm": 12.822498321533203,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 1.7858,
      "step": 1280
    },
    {
      "epoch": 3.225,
      "grad_norm": 2.504038095474243,
      "learning_rate": 7.100000000000001e-06,
      "loss": 1.6753,
      "step": 1290
    },
    {
      "epoch": 3.25,
      "grad_norm": 1.0970689058303833,
      "learning_rate": 7e-06,
      "loss": 1.6171,
      "step": 1300
    },
    {
      "epoch": 3.275,
      "grad_norm": 2.6205241680145264,
      "learning_rate": 6.9e-06,
      "loss": 1.2819,
      "step": 1310
    },
    {
      "epoch": 3.3,
      "grad_norm": 1.3209072351455688,
      "learning_rate": 6.800000000000001e-06,
      "loss": 1.6669,
      "step": 1320
    },
    {
      "epoch": 3.325,
      "grad_norm": 6.18387508392334,
      "learning_rate": 6.700000000000001e-06,
      "loss": 1.572,
      "step": 1330
    },
    {
      "epoch": 3.35,
      "grad_norm": 7.554809093475342,
      "learning_rate": 6.600000000000001e-06,
      "loss": 1.621,
      "step": 1340
    },
    {
      "epoch": 3.375,
      "grad_norm": 8.28931713104248,
      "learning_rate": 6.5000000000000004e-06,
      "loss": 1.3405,
      "step": 1350
    },
    {
      "epoch": 3.4,
      "grad_norm": 7.459342956542969,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 1.7289,
      "step": 1360
    },
    {
      "epoch": 3.425,
      "grad_norm": 1.0719680786132812,
      "learning_rate": 6.300000000000001e-06,
      "loss": 1.3691,
      "step": 1370
    },
    {
      "epoch": 3.45,
      "grad_norm": 3.2976412773132324,
      "learning_rate": 6.200000000000001e-06,
      "loss": 1.6399,
      "step": 1380
    },
    {
      "epoch": 3.475,
      "grad_norm": 6.960519790649414,
      "learning_rate": 6.1e-06,
      "loss": 1.1903,
      "step": 1390
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.1840627193450928,
      "learning_rate": 6e-06,
      "loss": 1.6722,
      "step": 1400
    },
    {
      "epoch": 3.525,
      "grad_norm": 10.626968383789062,
      "learning_rate": 5.9e-06,
      "loss": 1.4026,
      "step": 1410
    },
    {
      "epoch": 3.55,
      "grad_norm": 2.9099953174591064,
      "learning_rate": 5.8e-06,
      "loss": 1.333,
      "step": 1420
    },
    {
      "epoch": 3.575,
      "grad_norm": 6.874935150146484,
      "learning_rate": 5.7e-06,
      "loss": 1.444,
      "step": 1430
    },
    {
      "epoch": 3.6,
      "grad_norm": 1.6039921045303345,
      "learning_rate": 5.600000000000001e-06,
      "loss": 1.9615,
      "step": 1440
    },
    {
      "epoch": 3.625,
      "grad_norm": 3.3226425647735596,
      "learning_rate": 5.500000000000001e-06,
      "loss": 1.4471,
      "step": 1450
    },
    {
      "epoch": 3.65,
      "grad_norm": 0.8198592066764832,
      "learning_rate": 5.400000000000001e-06,
      "loss": 1.111,
      "step": 1460
    },
    {
      "epoch": 3.675,
      "grad_norm": 2.200191020965576,
      "learning_rate": 5.300000000000001e-06,
      "loss": 1.6108,
      "step": 1470
    },
    {
      "epoch": 3.7,
      "grad_norm": 1.6467258930206299,
      "learning_rate": 5.2e-06,
      "loss": 1.3662,
      "step": 1480
    },
    {
      "epoch": 3.725,
      "grad_norm": 21.577899932861328,
      "learning_rate": 5.1e-06,
      "loss": 1.7619,
      "step": 1490
    },
    {
      "epoch": 3.75,
      "grad_norm": 10.908743858337402,
      "learning_rate": 5e-06,
      "loss": 1.5449,
      "step": 1500
    },
    {
      "epoch": 3.775,
      "grad_norm": 14.157689094543457,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 1.701,
      "step": 1510
    },
    {
      "epoch": 3.8,
      "grad_norm": 2.318659543991089,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.6123,
      "step": 1520
    },
    {
      "epoch": 3.825,
      "grad_norm": 3.1259167194366455,
      "learning_rate": 4.7e-06,
      "loss": 1.4108,
      "step": 1530
    },
    {
      "epoch": 3.85,
      "grad_norm": 0.9985270500183105,
      "learning_rate": 4.600000000000001e-06,
      "loss": 1.3939,
      "step": 1540
    },
    {
      "epoch": 3.875,
      "grad_norm": 18.413806915283203,
      "learning_rate": 4.5e-06,
      "loss": 2.0268,
      "step": 1550
    },
    {
      "epoch": 3.9,
      "grad_norm": 1.4327787160873413,
      "learning_rate": 4.4e-06,
      "loss": 1.4815,
      "step": 1560
    },
    {
      "epoch": 3.925,
      "grad_norm": 6.25641393661499,
      "learning_rate": 4.3e-06,
      "loss": 1.5863,
      "step": 1570
    },
    {
      "epoch": 3.95,
      "grad_norm": 7.975721836090088,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 1.8014,
      "step": 1580
    },
    {
      "epoch": 3.975,
      "grad_norm": 9.99906063079834,
      "learning_rate": 4.1e-06,
      "loss": 1.6257,
      "step": 1590
    },
    {
      "epoch": 4.0,
      "grad_norm": 6.815397262573242,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.7811,
      "step": 1600
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.605211615562439,
      "eval_runtime": 179.9082,
      "eval_samples_per_second": 2.223,
      "eval_steps_per_second": 0.278,
      "step": 1600
    },
    {
      "epoch": 4.025,
      "grad_norm": 8.750836372375488,
      "learning_rate": 3.900000000000001e-06,
      "loss": 1.4724,
      "step": 1610
    },
    {
      "epoch": 4.05,
      "grad_norm": 10.807592391967773,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 1.6807,
      "step": 1620
    },
    {
      "epoch": 4.075,
      "grad_norm": 23.482824325561523,
      "learning_rate": 3.7e-06,
      "loss": 1.5182,
      "step": 1630
    },
    {
      "epoch": 4.1,
      "grad_norm": 10.791790008544922,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 1.3766,
      "step": 1640
    },
    {
      "epoch": 4.125,
      "grad_norm": 2.6700119972229004,
      "learning_rate": 3.5e-06,
      "loss": 1.6198,
      "step": 1650
    },
    {
      "epoch": 4.15,
      "grad_norm": 3.3935744762420654,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 1.5944,
      "step": 1660
    },
    {
      "epoch": 4.175,
      "grad_norm": 4.310108184814453,
      "learning_rate": 3.3000000000000006e-06,
      "loss": 2.0028,
      "step": 1670
    },
    {
      "epoch": 4.2,
      "grad_norm": 5.834546089172363,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 1.3597,
      "step": 1680
    },
    {
      "epoch": 4.225,
      "grad_norm": 1.2702422142028809,
      "learning_rate": 3.1000000000000004e-06,
      "loss": 1.4279,
      "step": 1690
    },
    {
      "epoch": 4.25,
      "grad_norm": 1.1610503196716309,
      "learning_rate": 3e-06,
      "loss": 1.237,
      "step": 1700
    },
    {
      "epoch": 4.275,
      "grad_norm": 3.733497381210327,
      "learning_rate": 2.9e-06,
      "loss": 1.4688,
      "step": 1710
    },
    {
      "epoch": 4.3,
      "grad_norm": 1.0653811693191528,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 1.7002,
      "step": 1720
    },
    {
      "epoch": 4.325,
      "grad_norm": 5.7914934158325195,
      "learning_rate": 2.7000000000000004e-06,
      "loss": 1.5866,
      "step": 1730
    },
    {
      "epoch": 4.35,
      "grad_norm": 10.097025871276855,
      "learning_rate": 2.6e-06,
      "loss": 1.3058,
      "step": 1740
    },
    {
      "epoch": 4.375,
      "grad_norm": 2.2957890033721924,
      "learning_rate": 2.5e-06,
      "loss": 1.5006,
      "step": 1750
    },
    {
      "epoch": 4.4,
      "grad_norm": 1.5741922855377197,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.4366,
      "step": 1760
    },
    {
      "epoch": 4.425,
      "grad_norm": 9.413016319274902,
      "learning_rate": 2.3000000000000004e-06,
      "loss": 2.1316,
      "step": 1770
    },
    {
      "epoch": 4.45,
      "grad_norm": 11.494050979614258,
      "learning_rate": 2.2e-06,
      "loss": 1.7012,
      "step": 1780
    },
    {
      "epoch": 4.475,
      "grad_norm": 4.699800968170166,
      "learning_rate": 2.1000000000000002e-06,
      "loss": 1.4526,
      "step": 1790
    },
    {
      "epoch": 4.5,
      "grad_norm": 3.780972480773926,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.4928,
      "step": 1800
    },
    {
      "epoch": 4.525,
      "grad_norm": 9.663272857666016,
      "learning_rate": 1.9000000000000002e-06,
      "loss": 1.5854,
      "step": 1810
    },
    {
      "epoch": 4.55,
      "grad_norm": 1.1007126569747925,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 1.5926,
      "step": 1820
    },
    {
      "epoch": 4.575,
      "grad_norm": 3.3589682579040527,
      "learning_rate": 1.7000000000000002e-06,
      "loss": 1.3675,
      "step": 1830
    },
    {
      "epoch": 4.6,
      "grad_norm": 8.529759407043457,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 1.5693,
      "step": 1840
    },
    {
      "epoch": 4.625,
      "grad_norm": 4.393673419952393,
      "learning_rate": 1.5e-06,
      "loss": 1.6012,
      "step": 1850
    },
    {
      "epoch": 4.65,
      "grad_norm": 23.471420288085938,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.5968,
      "step": 1860
    },
    {
      "epoch": 4.675,
      "grad_norm": 10.580934524536133,
      "learning_rate": 1.3e-06,
      "loss": 1.6844,
      "step": 1870
    },
    {
      "epoch": 4.7,
      "grad_norm": 4.5100579261779785,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.4989,
      "step": 1880
    },
    {
      "epoch": 4.725,
      "grad_norm": 8.684538841247559,
      "learning_rate": 1.1e-06,
      "loss": 1.7439,
      "step": 1890
    },
    {
      "epoch": 4.75,
      "grad_norm": 10.94477367401123,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.7858,
      "step": 1900
    },
    {
      "epoch": 4.775,
      "grad_norm": 9.082775115966797,
      "learning_rate": 9.000000000000001e-07,
      "loss": 1.2804,
      "step": 1910
    },
    {
      "epoch": 4.8,
      "grad_norm": 3.1917006969451904,
      "learning_rate": 8.000000000000001e-07,
      "loss": 1.3131,
      "step": 1920
    },
    {
      "epoch": 4.825,
      "grad_norm": 1.8910670280456543,
      "learning_rate": 7.000000000000001e-07,
      "loss": 1.5313,
      "step": 1930
    },
    {
      "epoch": 4.85,
      "grad_norm": 2.0198514461517334,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.5972,
      "step": 1940
    },
    {
      "epoch": 4.875,
      "grad_norm": 8.048125267028809,
      "learning_rate": 5.000000000000001e-07,
      "loss": 1.5084,
      "step": 1950
    },
    {
      "epoch": 4.9,
      "grad_norm": 3.2687554359436035,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 1.563,
      "step": 1960
    },
    {
      "epoch": 4.925,
      "grad_norm": 3.7933993339538574,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 1.3889,
      "step": 1970
    },
    {
      "epoch": 4.95,
      "grad_norm": 3.9348926544189453,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 1.6613,
      "step": 1980
    },
    {
      "epoch": 4.975,
      "grad_norm": 3.605724573135376,
      "learning_rate": 1.0000000000000001e-07,
      "loss": 1.7075,
      "step": 1990
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.839205265045166,
      "learning_rate": 0.0,
      "loss": 1.3854,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.747389217800192e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}