|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.99626400996264,
  "eval_steps": 500,
  "global_step": 1203,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024906600249066,
      "grad_norm": 10.55736720812015,
      "learning_rate": 5e-06,
      "loss": 0.899,
      "step": 10
    },
    {
      "epoch": 0.049813200498132,
      "grad_norm": 2.4056192000192786,
      "learning_rate": 5e-06,
      "loss": 0.7632,
      "step": 20
    },
    {
      "epoch": 0.074719800747198,
      "grad_norm": 1.0776649113170846,
      "learning_rate": 5e-06,
      "loss": 0.7213,
      "step": 30
    },
    {
      "epoch": 0.099626400996264,
      "grad_norm": 1.2974590316911792,
      "learning_rate": 5e-06,
      "loss": 0.6892,
      "step": 40
    },
    {
      "epoch": 0.12453300124533001,
      "grad_norm": 0.8664841939523225,
      "learning_rate": 5e-06,
      "loss": 0.6708,
      "step": 50
    },
    {
      "epoch": 0.149439601494396,
      "grad_norm": 0.7916914326975599,
      "learning_rate": 5e-06,
      "loss": 0.6504,
      "step": 60
    },
    {
      "epoch": 0.17434620174346202,
      "grad_norm": 0.8355113234133088,
      "learning_rate": 5e-06,
      "loss": 0.6407,
      "step": 70
    },
    {
      "epoch": 0.199252801992528,
      "grad_norm": 0.6185509662301869,
      "learning_rate": 5e-06,
      "loss": 0.6377,
      "step": 80
    },
    {
      "epoch": 0.22415940224159403,
      "grad_norm": 0.5793150300164642,
      "learning_rate": 5e-06,
      "loss": 0.6276,
      "step": 90
    },
    {
      "epoch": 0.24906600249066002,
      "grad_norm": 0.5522899454819212,
      "learning_rate": 5e-06,
      "loss": 0.6229,
      "step": 100
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.6471095275689863,
      "learning_rate": 5e-06,
      "loss": 0.6154,
      "step": 110
    },
    {
      "epoch": 0.298879202988792,
      "grad_norm": 0.6264851589702339,
      "learning_rate": 5e-06,
      "loss": 0.6104,
      "step": 120
    },
    {
      "epoch": 0.32378580323785805,
      "grad_norm": 0.705297908293922,
      "learning_rate": 5e-06,
      "loss": 0.6094,
      "step": 130
    },
    {
      "epoch": 0.34869240348692404,
      "grad_norm": 0.620653193497072,
      "learning_rate": 5e-06,
      "loss": 0.6079,
      "step": 140
    },
    {
      "epoch": 0.37359900373599003,
      "grad_norm": 0.9444217688486999,
      "learning_rate": 5e-06,
      "loss": 0.5983,
      "step": 150
    },
    {
      "epoch": 0.398505603985056,
      "grad_norm": 0.6034795779576418,
      "learning_rate": 5e-06,
      "loss": 0.6044,
      "step": 160
    },
    {
      "epoch": 0.42341220423412207,
      "grad_norm": 0.4974203412589217,
      "learning_rate": 5e-06,
      "loss": 0.5962,
      "step": 170
    },
    {
      "epoch": 0.44831880448318806,
      "grad_norm": 0.728230577535382,
      "learning_rate": 5e-06,
      "loss": 0.5975,
      "step": 180
    },
    {
      "epoch": 0.47322540473225405,
      "grad_norm": 0.49214457212025137,
      "learning_rate": 5e-06,
      "loss": 0.5986,
      "step": 190
    },
    {
      "epoch": 0.49813200498132004,
      "grad_norm": 0.5432395369586009,
      "learning_rate": 5e-06,
      "loss": 0.5885,
      "step": 200
    },
    {
      "epoch": 0.523038605230386,
      "grad_norm": 0.5473973456491285,
      "learning_rate": 5e-06,
      "loss": 0.5901,
      "step": 210
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.5759522711461309,
      "learning_rate": 5e-06,
      "loss": 0.586,
      "step": 220
    },
    {
      "epoch": 0.572851805728518,
      "grad_norm": 0.5583334053126371,
      "learning_rate": 5e-06,
      "loss": 0.5878,
      "step": 230
    },
    {
      "epoch": 0.597758405977584,
      "grad_norm": 0.7058646192825392,
      "learning_rate": 5e-06,
      "loss": 0.5818,
      "step": 240
    },
    {
      "epoch": 0.6226650062266501,
      "grad_norm": 0.5788660860127395,
      "learning_rate": 5e-06,
      "loss": 0.5853,
      "step": 250
    },
    {
      "epoch": 0.6475716064757161,
      "grad_norm": 0.6402936090502463,
      "learning_rate": 5e-06,
      "loss": 0.5769,
      "step": 260
    },
    {
      "epoch": 0.6724782067247821,
      "grad_norm": 0.5496743019012175,
      "learning_rate": 5e-06,
      "loss": 0.5762,
      "step": 270
    },
    {
      "epoch": 0.6973848069738481,
      "grad_norm": 0.5455503536049673,
      "learning_rate": 5e-06,
      "loss": 0.5808,
      "step": 280
    },
    {
      "epoch": 0.7222914072229141,
      "grad_norm": 0.5883422768851626,
      "learning_rate": 5e-06,
      "loss": 0.5747,
      "step": 290
    },
    {
      "epoch": 0.7471980074719801,
      "grad_norm": 0.7956726883043881,
      "learning_rate": 5e-06,
      "loss": 0.5717,
      "step": 300
    },
    {
      "epoch": 0.772104607721046,
      "grad_norm": 0.6813019491351078,
      "learning_rate": 5e-06,
      "loss": 0.5722,
      "step": 310
    },
    {
      "epoch": 0.797011207970112,
      "grad_norm": 0.5185381604929972,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 320
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.6909227667005327,
      "learning_rate": 5e-06,
      "loss": 0.5691,
      "step": 330
    },
    {
      "epoch": 0.8468244084682441,
      "grad_norm": 0.4796775398588659,
      "learning_rate": 5e-06,
      "loss": 0.5652,
      "step": 340
    },
    {
      "epoch": 0.8717310087173101,
      "grad_norm": 0.5758165163544798,
      "learning_rate": 5e-06,
      "loss": 0.5736,
      "step": 350
    },
    {
      "epoch": 0.8966376089663761,
      "grad_norm": 0.5892321233613834,
      "learning_rate": 5e-06,
      "loss": 0.5663,
      "step": 360
    },
    {
      "epoch": 0.9215442092154421,
      "grad_norm": 0.5088131774112012,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 370
    },
    {
      "epoch": 0.9464508094645081,
      "grad_norm": 0.5017508892526636,
      "learning_rate": 5e-06,
      "loss": 0.563,
      "step": 380
    },
    {
      "epoch": 0.9713574097135741,
      "grad_norm": 0.6797455724948757,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 390
    },
    {
      "epoch": 0.9962640099626401,
      "grad_norm": 0.5319255477795009,
      "learning_rate": 5e-06,
      "loss": 0.562,
      "step": 400
    },
    {
      "epoch": 0.9987546699875467,
      "eval_loss": 0.5648990869522095,
      "eval_runtime": 215.2105,
      "eval_samples_per_second": 50.272,
      "eval_steps_per_second": 0.395,
      "step": 401
    },
    {
      "epoch": 1.0211706102117062,
      "grad_norm": 0.6332423226665538,
      "learning_rate": 5e-06,
      "loss": 0.5373,
      "step": 410
    },
    {
      "epoch": 1.046077210460772,
      "grad_norm": 0.5250206863412192,
      "learning_rate": 5e-06,
      "loss": 0.5262,
      "step": 420
    },
    {
      "epoch": 1.0709838107098382,
      "grad_norm": 0.6270808658082102,
      "learning_rate": 5e-06,
      "loss": 0.5275,
      "step": 430
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.5516867253896128,
      "learning_rate": 5e-06,
      "loss": 0.5262,
      "step": 440
    },
    {
      "epoch": 1.1207970112079702,
      "grad_norm": 0.5358792491430266,
      "learning_rate": 5e-06,
      "loss": 0.5319,
      "step": 450
    },
    {
      "epoch": 1.145703611457036,
      "grad_norm": 0.7694988300392477,
      "learning_rate": 5e-06,
      "loss": 0.5251,
      "step": 460
    },
    {
      "epoch": 1.1706102117061021,
      "grad_norm": 0.5425248792129497,
      "learning_rate": 5e-06,
      "loss": 0.5255,
      "step": 470
    },
    {
      "epoch": 1.195516811955168,
      "grad_norm": 0.6608067800821592,
      "learning_rate": 5e-06,
      "loss": 0.5312,
      "step": 480
    },
    {
      "epoch": 1.2204234122042341,
      "grad_norm": 0.5892306715949471,
      "learning_rate": 5e-06,
      "loss": 0.5292,
      "step": 490
    },
    {
      "epoch": 1.2453300124533002,
      "grad_norm": 0.6342257885292061,
      "learning_rate": 5e-06,
      "loss": 0.5282,
      "step": 500
    },
    {
      "epoch": 1.270236612702366,
      "grad_norm": 0.6358865929230534,
      "learning_rate": 5e-06,
      "loss": 0.5256,
      "step": 510
    },
    {
      "epoch": 1.2951432129514322,
      "grad_norm": 0.5977464268642516,
      "learning_rate": 5e-06,
      "loss": 0.5216,
      "step": 520
    },
    {
      "epoch": 1.320049813200498,
      "grad_norm": 0.5713712938047752,
      "learning_rate": 5e-06,
      "loss": 0.5246,
      "step": 530
    },
    {
      "epoch": 1.3449564134495642,
      "grad_norm": 0.5806626902728351,
      "learning_rate": 5e-06,
      "loss": 0.522,
      "step": 540
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.6614502597905593,
      "learning_rate": 5e-06,
      "loss": 0.5232,
      "step": 550
    },
    {
      "epoch": 1.3947696139476962,
      "grad_norm": 0.5826714843440824,
      "learning_rate": 5e-06,
      "loss": 0.5243,
      "step": 560
    },
    {
      "epoch": 1.419676214196762,
      "grad_norm": 0.5870354154595686,
      "learning_rate": 5e-06,
      "loss": 0.5277,
      "step": 570
    },
    {
      "epoch": 1.4445828144458281,
      "grad_norm": 0.5517404762107168,
      "learning_rate": 5e-06,
      "loss": 0.5185,
      "step": 580
    },
    {
      "epoch": 1.4694894146948942,
      "grad_norm": 0.6548505841520894,
      "learning_rate": 5e-06,
      "loss": 0.5254,
      "step": 590
    },
    {
      "epoch": 1.4943960149439601,
      "grad_norm": 0.4798373315690756,
      "learning_rate": 5e-06,
      "loss": 0.5223,
      "step": 600
    },
    {
      "epoch": 1.519302615193026,
      "grad_norm": 0.561260368162772,
      "learning_rate": 5e-06,
      "loss": 0.5208,
      "step": 610
    },
    {
      "epoch": 1.544209215442092,
      "grad_norm": 0.4776435289710049,
      "learning_rate": 5e-06,
      "loss": 0.5169,
      "step": 620
    },
    {
      "epoch": 1.5691158156911582,
      "grad_norm": 0.5736062837326033,
      "learning_rate": 5e-06,
      "loss": 0.518,
      "step": 630
    },
    {
      "epoch": 1.5940224159402243,
      "grad_norm": 0.5605397398468376,
      "learning_rate": 5e-06,
      "loss": 0.519,
      "step": 640
    },
    {
      "epoch": 1.6189290161892902,
      "grad_norm": 0.5414003173485504,
      "learning_rate": 5e-06,
      "loss": 0.5141,
      "step": 650
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.5029725162688677,
      "learning_rate": 5e-06,
      "loss": 0.5123,
      "step": 660
    },
    {
      "epoch": 1.6687422166874222,
      "grad_norm": 0.49094415690364374,
      "learning_rate": 5e-06,
      "loss": 0.5103,
      "step": 670
    },
    {
      "epoch": 1.6936488169364883,
      "grad_norm": 0.6443717801298894,
      "learning_rate": 5e-06,
      "loss": 0.5173,
      "step": 680
    },
    {
      "epoch": 1.7185554171855542,
      "grad_norm": 0.6131661065579275,
      "learning_rate": 5e-06,
      "loss": 0.5123,
      "step": 690
    },
    {
      "epoch": 1.74346201743462,
      "grad_norm": 0.5102402228309177,
      "learning_rate": 5e-06,
      "loss": 0.5117,
      "step": 700
    },
    {
      "epoch": 1.7683686176836861,
      "grad_norm": 0.6532768204087063,
      "learning_rate": 5e-06,
      "loss": 0.5116,
      "step": 710
    },
    {
      "epoch": 1.7932752179327522,
      "grad_norm": 0.5359103919656766,
      "learning_rate": 5e-06,
      "loss": 0.5112,
      "step": 720
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.4654117626073739,
      "learning_rate": 5e-06,
      "loss": 0.5108,
      "step": 730
    },
    {
      "epoch": 1.8430884184308842,
      "grad_norm": 0.4816605018327504,
      "learning_rate": 5e-06,
      "loss": 0.5125,
      "step": 740
    },
    {
      "epoch": 1.86799501867995,
      "grad_norm": 0.5078445355540058,
      "learning_rate": 5e-06,
      "loss": 0.5174,
      "step": 750
    },
    {
      "epoch": 1.8929016189290162,
      "grad_norm": 0.5123586401271214,
      "learning_rate": 5e-06,
      "loss": 0.5103,
      "step": 760
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.512594100070009,
      "learning_rate": 5e-06,
      "loss": 0.5125,
      "step": 770
    },
    {
      "epoch": 1.9427148194271482,
      "grad_norm": 0.5251535719075618,
      "learning_rate": 5e-06,
      "loss": 0.5097,
      "step": 780
    },
    {
      "epoch": 1.967621419676214,
      "grad_norm": 0.5598844434696433,
      "learning_rate": 5e-06,
      "loss": 0.5114,
      "step": 790
    },
    {
      "epoch": 1.9925280199252802,
      "grad_norm": 0.5580903372050761,
      "learning_rate": 5e-06,
      "loss": 0.5112,
      "step": 800
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5391710996627808,
      "eval_runtime": 215.2899,
      "eval_samples_per_second": 50.253,
      "eval_steps_per_second": 0.395,
      "step": 803
    },
    {
      "epoch": 2.0174346201743463,
      "grad_norm": 0.7489739683512228,
      "learning_rate": 5e-06,
      "loss": 0.4772,
      "step": 810
    },
    {
      "epoch": 2.0423412204234124,
      "grad_norm": 0.6206410773051714,
      "learning_rate": 5e-06,
      "loss": 0.4773,
      "step": 820
    },
    {
      "epoch": 2.067247820672478,
      "grad_norm": 0.5964416221117852,
      "learning_rate": 5e-06,
      "loss": 0.4774,
      "step": 830
    },
    {
      "epoch": 2.092154420921544,
      "grad_norm": 0.7072977668899793,
      "learning_rate": 5e-06,
      "loss": 0.4734,
      "step": 840
    },
    {
      "epoch": 2.1170610211706102,
      "grad_norm": 0.4869213806678473,
      "learning_rate": 5e-06,
      "loss": 0.4738,
      "step": 850
    },
    {
      "epoch": 2.1419676214196763,
      "grad_norm": 0.5559708671866962,
      "learning_rate": 5e-06,
      "loss": 0.4739,
      "step": 860
    },
    {
      "epoch": 2.166874221668742,
      "grad_norm": 0.590838180422237,
      "learning_rate": 5e-06,
      "loss": 0.4781,
      "step": 870
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 0.5449563311605545,
      "learning_rate": 5e-06,
      "loss": 0.4768,
      "step": 880
    },
    {
      "epoch": 2.216687422166874,
      "grad_norm": 0.49509703126867355,
      "learning_rate": 5e-06,
      "loss": 0.4702,
      "step": 890
    },
    {
      "epoch": 2.2415940224159403,
      "grad_norm": 0.583235606431074,
      "learning_rate": 5e-06,
      "loss": 0.4785,
      "step": 900
    },
    {
      "epoch": 2.2665006226650064,
      "grad_norm": 0.5224340118110492,
      "learning_rate": 5e-06,
      "loss": 0.4688,
      "step": 910
    },
    {
      "epoch": 2.291407222914072,
      "grad_norm": 0.5751594164855693,
      "learning_rate": 5e-06,
      "loss": 0.4685,
      "step": 920
    },
    {
      "epoch": 2.316313823163138,
      "grad_norm": 0.5440703036591379,
      "learning_rate": 5e-06,
      "loss": 0.4773,
      "step": 930
    },
    {
      "epoch": 2.3412204234122043,
      "grad_norm": 0.7610931862664628,
      "learning_rate": 5e-06,
      "loss": 0.472,
      "step": 940
    },
    {
      "epoch": 2.3661270236612704,
      "grad_norm": 0.5531311055353737,
      "learning_rate": 5e-06,
      "loss": 0.4708,
      "step": 950
    },
    {
      "epoch": 2.391033623910336,
      "grad_norm": 0.511600827539314,
      "learning_rate": 5e-06,
      "loss": 0.4764,
      "step": 960
    },
    {
      "epoch": 2.415940224159402,
      "grad_norm": 0.5621388473604839,
      "learning_rate": 5e-06,
      "loss": 0.4792,
      "step": 970
    },
    {
      "epoch": 2.4408468244084682,
      "grad_norm": 0.574647600528262,
      "learning_rate": 5e-06,
      "loss": 0.4722,
      "step": 980
    },
    {
      "epoch": 2.4657534246575343,
      "grad_norm": 0.5174894983673209,
      "learning_rate": 5e-06,
      "loss": 0.4752,
      "step": 990
    },
    {
      "epoch": 2.4906600249066004,
      "grad_norm": 0.49985821335778424,
      "learning_rate": 5e-06,
      "loss": 0.4824,
      "step": 1000
    },
    {
      "epoch": 2.515566625155666,
      "grad_norm": 0.6533346544154589,
      "learning_rate": 5e-06,
      "loss": 0.468,
      "step": 1010
    },
    {
      "epoch": 2.540473225404732,
      "grad_norm": 0.5759385792339666,
      "learning_rate": 5e-06,
      "loss": 0.4759,
      "step": 1020
    },
    {
      "epoch": 2.5653798256537983,
      "grad_norm": 0.5706835517348989,
      "learning_rate": 5e-06,
      "loss": 0.473,
      "step": 1030
    },
    {
      "epoch": 2.5902864259028644,
      "grad_norm": 0.48023447741714353,
      "learning_rate": 5e-06,
      "loss": 0.4758,
      "step": 1040
    },
    {
      "epoch": 2.61519302615193,
      "grad_norm": 0.48468151801419096,
      "learning_rate": 5e-06,
      "loss": 0.4721,
      "step": 1050
    },
    {
      "epoch": 2.640099626400996,
      "grad_norm": 0.5091065263641408,
      "learning_rate": 5e-06,
      "loss": 0.479,
      "step": 1060
    },
    {
      "epoch": 2.6650062266500623,
      "grad_norm": 0.5876312291375192,
      "learning_rate": 5e-06,
      "loss": 0.471,
      "step": 1070
    },
    {
      "epoch": 2.6899128268991284,
      "grad_norm": 0.5815028633704035,
      "learning_rate": 5e-06,
      "loss": 0.4762,
      "step": 1080
    },
    {
      "epoch": 2.7148194271481945,
      "grad_norm": 0.5605250992780186,
      "learning_rate": 5e-06,
      "loss": 0.4735,
      "step": 1090
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 0.5741942655339728,
      "learning_rate": 5e-06,
      "loss": 0.4747,
      "step": 1100
    },
    {
      "epoch": 2.7646326276463262,
      "grad_norm": 0.4775069856578055,
      "learning_rate": 5e-06,
      "loss": 0.4757,
      "step": 1110
    },
    {
      "epoch": 2.7895392278953923,
      "grad_norm": 0.498224788090082,
      "learning_rate": 5e-06,
      "loss": 0.4741,
      "step": 1120
    },
    {
      "epoch": 2.8144458281444584,
      "grad_norm": 0.48637748558305866,
      "learning_rate": 5e-06,
      "loss": 0.4767,
      "step": 1130
    },
    {
      "epoch": 2.839352428393524,
      "grad_norm": 0.5090237580915907,
      "learning_rate": 5e-06,
      "loss": 0.4734,
      "step": 1140
    },
    {
      "epoch": 2.86425902864259,
      "grad_norm": 0.4947938642895003,
      "learning_rate": 5e-06,
      "loss": 0.4782,
      "step": 1150
    },
    {
      "epoch": 2.8891656288916563,
      "grad_norm": 0.5195966815447782,
      "learning_rate": 5e-06,
      "loss": 0.4803,
      "step": 1160
    },
    {
      "epoch": 2.9140722291407224,
      "grad_norm": 0.5212585367090384,
      "learning_rate": 5e-06,
      "loss": 0.4722,
      "step": 1170
    },
    {
      "epoch": 2.9389788293897885,
      "grad_norm": 0.514479905781623,
      "learning_rate": 5e-06,
      "loss": 0.4752,
      "step": 1180
    },
    {
      "epoch": 2.963885429638854,
      "grad_norm": 0.5039971892926054,
      "learning_rate": 5e-06,
      "loss": 0.475,
      "step": 1190
    },
    {
      "epoch": 2.9887920298879203,
      "grad_norm": 0.4924909134578761,
      "learning_rate": 5e-06,
      "loss": 0.4697,
      "step": 1200
    },
    {
      "epoch": 2.99626400996264,
      "eval_loss": 0.5331025123596191,
      "eval_runtime": 217.1359,
      "eval_samples_per_second": 49.826,
      "eval_steps_per_second": 0.391,
      "step": 1203
    },
    {
      "epoch": 2.99626400996264,
      "step": 1203,
      "total_flos": 2014651046952960.0,
      "train_loss": 0.5350556444943397,
      "train_runtime": 35909.5127,
      "train_samples_per_second": 17.172,
      "train_steps_per_second": 0.034
    }
  ],
  "logging_steps": 10,
  "max_steps": 1203,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2014651046952960.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|