{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9946777054997042,
  "eval_steps": 500,
  "global_step": 1266,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02365464222353637,
      "grad_norm": 39.04922181795541,
      "learning_rate": 5e-06,
      "loss": 0.9233,
      "step": 10
    },
    {
      "epoch": 0.04730928444707274,
      "grad_norm": 8.275301054500245,
      "learning_rate": 5e-06,
      "loss": 0.8198,
      "step": 20
    },
    {
      "epoch": 0.0709639266706091,
      "grad_norm": 16.91349063149131,
      "learning_rate": 5e-06,
      "loss": 0.8094,
      "step": 30
    },
    {
      "epoch": 0.09461856889414548,
      "grad_norm": 14.148988171179118,
      "learning_rate": 5e-06,
      "loss": 0.7679,
      "step": 40
    },
    {
      "epoch": 0.11827321111768184,
      "grad_norm": 1.026404466612184,
      "learning_rate": 5e-06,
      "loss": 0.7523,
      "step": 50
    },
    {
      "epoch": 0.1419278533412182,
      "grad_norm": 0.8542225183892468,
      "learning_rate": 5e-06,
      "loss": 0.725,
      "step": 60
    },
    {
      "epoch": 0.16558249556475457,
      "grad_norm": 0.7426518873227826,
      "learning_rate": 5e-06,
      "loss": 0.7185,
      "step": 70
    },
    {
      "epoch": 0.18923713778829096,
      "grad_norm": 0.6764382829611749,
      "learning_rate": 5e-06,
      "loss": 0.6999,
      "step": 80
    },
    {
      "epoch": 0.21289178001182732,
      "grad_norm": 0.5663920137594394,
      "learning_rate": 5e-06,
      "loss": 0.7,
      "step": 90
    },
    {
      "epoch": 0.23654642223536368,
      "grad_norm": 0.6218835377066562,
      "learning_rate": 5e-06,
      "loss": 0.6968,
      "step": 100
    },
    {
      "epoch": 0.26020106445890007,
      "grad_norm": 0.601497886261039,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 110
    },
    {
      "epoch": 0.2838557066824364,
      "grad_norm": 0.8786039473525534,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 120
    },
    {
      "epoch": 0.3075103489059728,
      "grad_norm": 0.5373447312315734,
      "learning_rate": 5e-06,
      "loss": 0.6792,
      "step": 130
    },
    {
      "epoch": 0.33116499112950915,
      "grad_norm": 0.5195473153997355,
      "learning_rate": 5e-06,
      "loss": 0.6746,
      "step": 140
    },
    {
      "epoch": 0.35481963335304556,
      "grad_norm": 0.5999120946052041,
      "learning_rate": 5e-06,
      "loss": 0.68,
      "step": 150
    },
    {
      "epoch": 0.3784742755765819,
      "grad_norm": 0.5351205582865509,
      "learning_rate": 5e-06,
      "loss": 0.6766,
      "step": 160
    },
    {
      "epoch": 0.4021289178001183,
      "grad_norm": 0.5425010920017291,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 170
    },
    {
      "epoch": 0.42578356002365464,
      "grad_norm": 0.702582958286065,
      "learning_rate": 5e-06,
      "loss": 0.6711,
      "step": 180
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 1.2564450479675764,
      "learning_rate": 5e-06,
      "loss": 0.6776,
      "step": 190
    },
    {
      "epoch": 0.47309284447072736,
      "grad_norm": 0.6226292820244915,
      "learning_rate": 5e-06,
      "loss": 0.6695,
      "step": 200
    },
    {
      "epoch": 0.4967474866942638,
      "grad_norm": 0.6729451633589562,
      "learning_rate": 5e-06,
      "loss": 0.6656,
      "step": 210
    },
    {
      "epoch": 0.5204021289178001,
      "grad_norm": 0.72360498041814,
      "learning_rate": 5e-06,
      "loss": 0.6699,
      "step": 220
    },
    {
      "epoch": 0.5440567711413364,
      "grad_norm": 0.5730443009789895,
      "learning_rate": 5e-06,
      "loss": 0.6693,
      "step": 230
    },
    {
      "epoch": 0.5677114133648729,
      "grad_norm": 0.6304204873609028,
      "learning_rate": 5e-06,
      "loss": 0.6564,
      "step": 240
    },
    {
      "epoch": 0.5913660555884093,
      "grad_norm": 0.6327686473365169,
      "learning_rate": 5e-06,
      "loss": 0.6707,
      "step": 250
    },
    {
      "epoch": 0.6150206978119456,
      "grad_norm": 0.5640335014419563,
      "learning_rate": 5e-06,
      "loss": 0.6617,
      "step": 260
    },
    {
      "epoch": 0.638675340035482,
      "grad_norm": 0.6026872513783592,
      "learning_rate": 5e-06,
      "loss": 0.6711,
      "step": 270
    },
    {
      "epoch": 0.6623299822590183,
      "grad_norm": 0.5194797310260643,
      "learning_rate": 5e-06,
      "loss": 0.6561,
      "step": 280
    },
    {
      "epoch": 0.6859846244825547,
      "grad_norm": 0.6658270095766984,
      "learning_rate": 5e-06,
      "loss": 0.663,
      "step": 290
    },
    {
      "epoch": 0.7096392667060911,
      "grad_norm": 0.5259962549449988,
      "learning_rate": 5e-06,
      "loss": 0.6511,
      "step": 300
    },
    {
      "epoch": 0.7332939089296274,
      "grad_norm": 0.5776532705758929,
      "learning_rate": 5e-06,
      "loss": 0.6649,
      "step": 310
    },
    {
      "epoch": 0.7569485511531638,
      "grad_norm": 0.5249892835904177,
      "learning_rate": 5e-06,
      "loss": 0.6648,
      "step": 320
    },
    {
      "epoch": 0.7806031933767001,
      "grad_norm": 0.5092145613062358,
      "learning_rate": 5e-06,
      "loss": 0.6614,
      "step": 330
    },
    {
      "epoch": 0.8042578356002366,
      "grad_norm": 0.5273167065626364,
      "learning_rate": 5e-06,
      "loss": 0.6468,
      "step": 340
    },
    {
      "epoch": 0.8279124778237729,
      "grad_norm": 0.5666036582386984,
      "learning_rate": 5e-06,
      "loss": 0.6562,
      "step": 350
    },
    {
      "epoch": 0.8515671200473093,
      "grad_norm": 0.6164474600239763,
      "learning_rate": 5e-06,
      "loss": 0.6544,
      "step": 360
    },
    {
      "epoch": 0.8752217622708457,
      "grad_norm": 0.5854672267431167,
      "learning_rate": 5e-06,
      "loss": 0.6547,
      "step": 370
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.5313196039449892,
      "learning_rate": 5e-06,
      "loss": 0.6584,
      "step": 380
    },
    {
      "epoch": 0.9225310467179184,
      "grad_norm": 0.5598019500152581,
      "learning_rate": 5e-06,
      "loss": 0.6533,
      "step": 390
    },
    {
      "epoch": 0.9461856889414547,
      "grad_norm": 0.6003027055491813,
      "learning_rate": 5e-06,
      "loss": 0.6458,
      "step": 400
    },
    {
      "epoch": 0.9698403311649911,
      "grad_norm": 0.6341763962447327,
      "learning_rate": 5e-06,
      "loss": 0.656,
      "step": 410
    },
    {
      "epoch": 0.9934949733885275,
      "grad_norm": 0.6083507877729101,
      "learning_rate": 5e-06,
      "loss": 0.6538,
      "step": 420
    },
    {
      "epoch": 0.9982259018332348,
      "eval_loss": 0.6558582186698914,
      "eval_runtime": 224.885,
      "eval_samples_per_second": 50.635,
      "eval_steps_per_second": 0.396,
      "step": 422
    },
    {
      "epoch": 1.0171496156120639,
      "grad_norm": 0.9361782463296332,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 430
    },
    {
      "epoch": 1.0408042578356003,
      "grad_norm": 0.6204113010596938,
      "learning_rate": 5e-06,
      "loss": 0.6065,
      "step": 440
    },
    {
      "epoch": 1.0644589000591367,
      "grad_norm": 0.6564336264095381,
      "learning_rate": 5e-06,
      "loss": 0.6048,
      "step": 450
    },
    {
      "epoch": 1.0881135422826729,
      "grad_norm": 0.5555833545533679,
      "learning_rate": 5e-06,
      "loss": 0.6072,
      "step": 460
    },
    {
      "epoch": 1.1117681845062093,
      "grad_norm": 0.568736434370096,
      "learning_rate": 5e-06,
      "loss": 0.6026,
      "step": 470
    },
    {
      "epoch": 1.1354228267297457,
      "grad_norm": 0.5963174527245159,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 480
    },
    {
      "epoch": 1.1590774689532821,
      "grad_norm": 0.6296624775692966,
      "learning_rate": 5e-06,
      "loss": 0.6036,
      "step": 490
    },
    {
      "epoch": 1.1827321111768185,
      "grad_norm": 0.7667349314546962,
      "learning_rate": 5e-06,
      "loss": 0.6068,
      "step": 500
    },
    {
      "epoch": 1.2063867534003547,
      "grad_norm": 0.6034621980970892,
      "learning_rate": 5e-06,
      "loss": 0.6104,
      "step": 510
    },
    {
      "epoch": 1.2300413956238911,
      "grad_norm": 0.5825117431703367,
      "learning_rate": 5e-06,
      "loss": 0.6026,
      "step": 520
    },
    {
      "epoch": 1.2536960378474276,
      "grad_norm": 0.5671081402783421,
      "learning_rate": 5e-06,
      "loss": 0.604,
      "step": 530
    },
    {
      "epoch": 1.277350680070964,
      "grad_norm": 0.5309591912112671,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 540
    },
    {
      "epoch": 1.3010053222945004,
      "grad_norm": 0.5636046858771947,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 550
    },
    {
      "epoch": 1.3246599645180366,
      "grad_norm": 0.6623955102141809,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 560
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 0.5742305096790601,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 570
    },
    {
      "epoch": 1.3719692489651094,
      "grad_norm": 0.5167065988140831,
      "learning_rate": 5e-06,
      "loss": 0.6091,
      "step": 580
    },
    {
      "epoch": 1.3956238911886458,
      "grad_norm": 0.5112713876137833,
      "learning_rate": 5e-06,
      "loss": 0.6049,
      "step": 590
    },
    {
      "epoch": 1.4192785334121822,
      "grad_norm": 0.515536375353522,
      "learning_rate": 5e-06,
      "loss": 0.6079,
      "step": 600
    },
    {
      "epoch": 1.4429331756357184,
      "grad_norm": 0.5943800369847494,
      "learning_rate": 5e-06,
      "loss": 0.602,
      "step": 610
    },
    {
      "epoch": 1.4665878178592548,
      "grad_norm": 0.5570413849081146,
      "learning_rate": 5e-06,
      "loss": 0.6074,
      "step": 620
    },
    {
      "epoch": 1.4902424600827913,
      "grad_norm": 0.5383074416990815,
      "learning_rate": 5e-06,
      "loss": 0.6055,
      "step": 630
    },
    {
      "epoch": 1.5138971023063275,
      "grad_norm": 0.6221748842819845,
      "learning_rate": 5e-06,
      "loss": 0.6048,
      "step": 640
    },
    {
      "epoch": 1.537551744529864,
      "grad_norm": 0.623130737085543,
      "learning_rate": 5e-06,
      "loss": 0.6124,
      "step": 650
    },
    {
      "epoch": 1.5612063867534003,
      "grad_norm": 0.7728758992657894,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 660
    },
    {
      "epoch": 1.5848610289769367,
      "grad_norm": 0.5531126954202661,
      "learning_rate": 5e-06,
      "loss": 0.6023,
      "step": 670
    },
    {
      "epoch": 1.6085156712004731,
      "grad_norm": 0.8207249388527519,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 680
    },
    {
      "epoch": 1.6321703134240093,
      "grad_norm": 0.7382668830128054,
      "learning_rate": 5e-06,
      "loss": 0.6027,
      "step": 690
    },
    {
      "epoch": 1.655824955647546,
      "grad_norm": 0.8181634349883082,
      "learning_rate": 5e-06,
      "loss": 0.6077,
      "step": 700
    },
    {
      "epoch": 1.6794795978710821,
      "grad_norm": 0.5715750112816181,
      "learning_rate": 5e-06,
      "loss": 0.603,
      "step": 710
    },
    {
      "epoch": 1.7031342400946186,
      "grad_norm": 0.564060422032355,
      "learning_rate": 5e-06,
      "loss": 0.6137,
      "step": 720
    },
    {
      "epoch": 1.726788882318155,
      "grad_norm": 0.5112934215435977,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 730
    },
    {
      "epoch": 1.7504435245416912,
      "grad_norm": 0.6498040890743698,
      "learning_rate": 5e-06,
      "loss": 0.6038,
      "step": 740
    },
    {
      "epoch": 1.7740981667652278,
      "grad_norm": 0.6625174306160165,
      "learning_rate": 5e-06,
      "loss": 0.604,
      "step": 750
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 0.5200239654238437,
      "learning_rate": 5e-06,
      "loss": 0.604,
      "step": 760
    },
    {
      "epoch": 1.8214074512123004,
      "grad_norm": 0.5056250667365105,
      "learning_rate": 5e-06,
      "loss": 0.601,
      "step": 770
    },
    {
      "epoch": 1.8450620934358368,
      "grad_norm": 0.5465224841554837,
      "learning_rate": 5e-06,
      "loss": 0.6043,
      "step": 780
    },
    {
      "epoch": 1.868716735659373,
      "grad_norm": 0.5173445168820222,
      "learning_rate": 5e-06,
      "loss": 0.6051,
      "step": 790
    },
    {
      "epoch": 1.8923713778829097,
      "grad_norm": 0.5037163086029489,
      "learning_rate": 5e-06,
      "loss": 0.6071,
      "step": 800
    },
    {
      "epoch": 1.9160260201064458,
      "grad_norm": 0.5032092904194995,
      "learning_rate": 5e-06,
      "loss": 0.6097,
      "step": 810
    },
    {
      "epoch": 1.9396806623299823,
      "grad_norm": 0.5153373413177225,
      "learning_rate": 5e-06,
      "loss": 0.6017,
      "step": 820
    },
    {
      "epoch": 1.9633353045535187,
      "grad_norm": 0.48445434626456924,
      "learning_rate": 5e-06,
      "loss": 0.6097,
      "step": 830
    },
    {
      "epoch": 1.9869899467770549,
      "grad_norm": 0.5493690840998718,
      "learning_rate": 5e-06,
      "loss": 0.5996,
      "step": 840
    },
    {
      "epoch": 1.9988172678888232,
      "eval_loss": 0.6458428502082825,
      "eval_runtime": 226.2611,
      "eval_samples_per_second": 50.327,
      "eval_steps_per_second": 0.393,
      "step": 845
    },
    {
      "epoch": 2.0106445890005915,
      "grad_norm": 0.6225484532666892,
      "learning_rate": 5e-06,
      "loss": 0.5795,
      "step": 850
    },
    {
      "epoch": 2.0342992312241277,
      "grad_norm": 0.5819857678964343,
      "learning_rate": 5e-06,
      "loss": 0.5553,
      "step": 860
    },
    {
      "epoch": 2.057953873447664,
      "grad_norm": 0.7185360534865078,
      "learning_rate": 5e-06,
      "loss": 0.5506,
      "step": 870
    },
    {
      "epoch": 2.0816085156712005,
      "grad_norm": 0.5134284842767335,
      "learning_rate": 5e-06,
      "loss": 0.5539,
      "step": 880
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.5986326239884353,
      "learning_rate": 5e-06,
      "loss": 0.5609,
      "step": 890
    },
    {
      "epoch": 2.1289178001182734,
      "grad_norm": 0.579714763513885,
      "learning_rate": 5e-06,
      "loss": 0.5546,
      "step": 900
    },
    {
      "epoch": 2.1525724423418096,
      "grad_norm": 0.570292196409214,
      "learning_rate": 5e-06,
      "loss": 0.5586,
      "step": 910
    },
    {
      "epoch": 2.1762270845653457,
      "grad_norm": 0.5811117001743673,
      "learning_rate": 5e-06,
      "loss": 0.5585,
      "step": 920
    },
    {
      "epoch": 2.1998817267888824,
      "grad_norm": 0.554144816987719,
      "learning_rate": 5e-06,
      "loss": 0.5562,
      "step": 930
    },
    {
      "epoch": 2.2235363690124186,
      "grad_norm": 0.6493826388527278,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 940
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 0.5220557856218626,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 950
    },
    {
      "epoch": 2.2708456534594914,
      "grad_norm": 0.6378102281048501,
      "learning_rate": 5e-06,
      "loss": 0.5602,
      "step": 960
    },
    {
      "epoch": 2.2945002956830276,
      "grad_norm": 0.5497371876386185,
      "learning_rate": 5e-06,
      "loss": 0.5628,
      "step": 970
    },
    {
      "epoch": 2.3181549379065642,
      "grad_norm": 0.6521682175920844,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 980
    },
    {
      "epoch": 2.3418095801301004,
      "grad_norm": 0.5734936169662879,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 990
    },
    {
      "epoch": 2.365464222353637,
      "grad_norm": 0.5394258314592499,
      "learning_rate": 5e-06,
      "loss": 0.5569,
      "step": 1000
    },
    {
      "epoch": 2.3891188645771733,
      "grad_norm": 0.5306593171364488,
      "learning_rate": 5e-06,
      "loss": 0.5502,
      "step": 1010
    },
    {
      "epoch": 2.4127735068007095,
      "grad_norm": 0.5344199954837688,
      "learning_rate": 5e-06,
      "loss": 0.5549,
      "step": 1020
    },
    {
      "epoch": 2.436428149024246,
      "grad_norm": 0.5892751227456119,
      "learning_rate": 5e-06,
      "loss": 0.5533,
      "step": 1030
    },
    {
      "epoch": 2.4600827912477823,
      "grad_norm": 0.6529042003930223,
      "learning_rate": 5e-06,
      "loss": 0.5613,
      "step": 1040
    },
    {
      "epoch": 2.483737433471319,
      "grad_norm": 0.5765438425321338,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 1050
    },
    {
      "epoch": 2.507392075694855,
      "grad_norm": 0.6764490044193554,
      "learning_rate": 5e-06,
      "loss": 0.567,
      "step": 1060
    },
    {
      "epoch": 2.5310467179183913,
      "grad_norm": 0.5365218264481744,
      "learning_rate": 5e-06,
      "loss": 0.5532,
      "step": 1070
    },
    {
      "epoch": 2.554701360141928,
      "grad_norm": 0.6033785399498255,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 1080
    },
    {
      "epoch": 2.578356002365464,
      "grad_norm": 0.8004909937255467,
      "learning_rate": 5e-06,
      "loss": 0.5661,
      "step": 1090
    },
    {
      "epoch": 2.6020106445890008,
      "grad_norm": 0.5819582134735406,
      "learning_rate": 5e-06,
      "loss": 0.5616,
      "step": 1100
    },
    {
      "epoch": 2.625665286812537,
      "grad_norm": 0.5537773395049099,
      "learning_rate": 5e-06,
      "loss": 0.5628,
      "step": 1110
    },
    {
      "epoch": 2.649319929036073,
      "grad_norm": 0.5539615560141525,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 1120
    },
    {
      "epoch": 2.67297457125961,
      "grad_norm": 0.6206027218523953,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 1130
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 0.5108322877934205,
      "learning_rate": 5e-06,
      "loss": 0.5586,
      "step": 1140
    },
    {
      "epoch": 2.7202838557066826,
      "grad_norm": 0.48797735494965916,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 1150
    },
    {
      "epoch": 2.743938497930219,
      "grad_norm": 0.5823974142352172,
      "learning_rate": 5e-06,
      "loss": 0.5671,
      "step": 1160
    },
    {
      "epoch": 2.767593140153755,
      "grad_norm": 0.8599218035136146,
      "learning_rate": 5e-06,
      "loss": 0.5723,
      "step": 1170
    },
    {
      "epoch": 2.7912477823772917,
      "grad_norm": 0.6555716714163583,
      "learning_rate": 5e-06,
      "loss": 0.5633,
      "step": 1180
    },
    {
      "epoch": 2.814902424600828,
      "grad_norm": 0.49879910164951613,
      "learning_rate": 5e-06,
      "loss": 0.5581,
      "step": 1190
    },
    {
      "epoch": 2.8385570668243645,
      "grad_norm": 0.5241725506783274,
      "learning_rate": 5e-06,
      "loss": 0.5623,
      "step": 1200
    },
    {
      "epoch": 2.8622117090479007,
      "grad_norm": 0.6173811070502804,
      "learning_rate": 5e-06,
      "loss": 0.569,
      "step": 1210
    },
    {
      "epoch": 2.885866351271437,
      "grad_norm": 0.5397292738316359,
      "learning_rate": 5e-06,
      "loss": 0.5642,
      "step": 1220
    },
    {
      "epoch": 2.9095209934949735,
      "grad_norm": 0.7053290870019903,
      "learning_rate": 5e-06,
      "loss": 0.5593,
      "step": 1230
    },
    {
      "epoch": 2.9331756357185097,
      "grad_norm": 0.5500348460578961,
      "learning_rate": 5e-06,
      "loss": 0.5591,
      "step": 1240
    },
    {
      "epoch": 2.9568302779420463,
      "grad_norm": 0.5833114667049699,
      "learning_rate": 5e-06,
      "loss": 0.5649,
      "step": 1250
    },
    {
      "epoch": 2.9804849201655825,
      "grad_norm": 0.569413301750619,
      "learning_rate": 5e-06,
      "loss": 0.5577,
      "step": 1260
    },
    {
      "epoch": 2.9946777054997042,
      "eval_loss": 0.648719847202301,
      "eval_runtime": 225.8886,
      "eval_samples_per_second": 50.41,
      "eval_steps_per_second": 0.394,
      "step": 1266
    },
    {
      "epoch": 2.9946777054997042,
      "step": 1266,
      "total_flos": 2120178393415680.0,
      "train_loss": 0.6180764295478568,
      "train_runtime": 37891.5513,
      "train_samples_per_second": 17.129,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1266,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2120178393415680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}