|
{ |
|
"best_metric": 1.4779504537582397, |
|
"best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1800", |
|
"epoch": 3.2, |
|
"eval_steps": 100, |
|
"global_step": 1800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.6079277992248535, |
|
"learning_rate": 4.999960939662063e-05, |
|
"loss": 3.747, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.2283411026000977, |
|
"learning_rate": 4.999843759868819e-05, |
|
"loss": 3.5789, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 41.573001861572266, |
|
"learning_rate": 4.999648464281934e-05, |
|
"loss": 3.1683, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.080965518951416, |
|
"learning_rate": 4.9993750590040575e-05, |
|
"loss": 2.8275, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.576275825500488, |
|
"learning_rate": 4.999023552578632e-05, |
|
"loss": 2.6758, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 18.012842178344727, |
|
"learning_rate": 4.998593955989626e-05, |
|
"loss": 2.6287, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.738934516906738, |
|
"learning_rate": 4.9980862826611875e-05, |
|
"loss": 2.5284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.353776216506958, |
|
"learning_rate": 4.9975005484572305e-05, |
|
"loss": 2.2608, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.6298699378967285, |
|
"learning_rate": 4.9968367716809374e-05, |
|
"loss": 2.2475, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 50.594207763671875, |
|
"learning_rate": 4.996094973074183e-05, |
|
"loss": 2.2007, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 2.126384735107422, |
|
"eval_runtime": 124.9221, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.001, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.225520133972168, |
|
"learning_rate": 4.995275175816891e-05, |
|
"loss": 1.9414, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.777626991271973, |
|
"learning_rate": 4.994377405526308e-05, |
|
"loss": 1.9729, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.133576393127441, |
|
"learning_rate": 4.993401690256203e-05, |
|
"loss": 2.0237, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.396271228790283, |
|
"learning_rate": 4.992348060495989e-05, |
|
"loss": 2.009, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.4974453449249268, |
|
"learning_rate": 4.991216549169776e-05, |
|
"loss": 2.032, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 12.256199836730957, |
|
"learning_rate": 4.990007191635334e-05, |
|
"loss": 1.9548, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.5634379386901855, |
|
"learning_rate": 4.988720025682995e-05, |
|
"loss": 1.8164, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 14.023727416992188, |
|
"learning_rate": 4.987355091534468e-05, |
|
"loss": 1.8517, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.622091293334961, |
|
"learning_rate": 4.985912431841584e-05, |
|
"loss": 2.0255, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.9935083389282227, |
|
"learning_rate": 4.9843920916849645e-05, |
|
"loss": 1.8777, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.8619400262832642, |
|
"eval_runtime": 124.8712, |
|
"eval_samples_per_second": 8.008, |
|
"eval_steps_per_second": 2.002, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.256485939025879, |
|
"learning_rate": 4.982794118572609e-05, |
|
"loss": 1.8885, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 13.212824821472168, |
|
"learning_rate": 4.981118562438414e-05, |
|
"loss": 1.7744, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 4.2626118659973145, |
|
"learning_rate": 4.9793654756406085e-05, |
|
"loss": 1.7545, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 4.217405796051025, |
|
"learning_rate": 4.9775349129601243e-05, |
|
"loss": 1.5633, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 22.393404006958008, |
|
"learning_rate": 4.9756269315988804e-05, |
|
"loss": 1.8871, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.6576473712921143, |
|
"learning_rate": 4.973641591177991e-05, |
|
"loss": 1.7037, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.2433271408081055, |
|
"learning_rate": 4.971578953735912e-05, |
|
"loss": 1.7631, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.7399721145629883, |
|
"learning_rate": 4.969439083726496e-05, |
|
"loss": 1.7714, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.575680255889893, |
|
"learning_rate": 4.967222048016979e-05, |
|
"loss": 1.8699, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 7.729683876037598, |
|
"learning_rate": 4.964927915885893e-05, |
|
"loss": 1.6566, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.7350378036499023, |
|
"eval_runtime": 124.9278, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.001, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.755899667739868, |
|
"learning_rate": 4.962556759020898e-05, |
|
"loss": 1.7193, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.513024091720581, |
|
"learning_rate": 4.960108651516545e-05, |
|
"loss": 1.852, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.7794790267944336, |
|
"learning_rate": 4.9575836698719605e-05, |
|
"loss": 1.6785, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.2256739139556885, |
|
"learning_rate": 4.954981892988451e-05, |
|
"loss": 1.6648, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.8756954669952393, |
|
"learning_rate": 4.952303402167047e-05, |
|
"loss": 1.6399, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 7.057961463928223, |
|
"learning_rate": 4.949548281105951e-05, |
|
"loss": 1.5875, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.63081169128418, |
|
"learning_rate": 4.946716615897932e-05, |
|
"loss": 1.6708, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 8.755204200744629, |
|
"learning_rate": 4.943808495027631e-05, |
|
"loss": 1.636, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 10.21866226196289, |
|
"learning_rate": 4.940824009368793e-05, |
|
"loss": 1.5714, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.44133186340332, |
|
"learning_rate": 4.937763252181434e-05, |
|
"loss": 1.4084, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.6840696334838867, |
|
"eval_runtime": 124.8851, |
|
"eval_samples_per_second": 8.007, |
|
"eval_steps_per_second": 2.002, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.056345224380493, |
|
"learning_rate": 4.934626319108923e-05, |
|
"loss": 1.7233, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.303133487701416, |
|
"learning_rate": 4.93141330817499e-05, |
|
"loss": 1.5374, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 5.2246623039245605, |
|
"learning_rate": 4.9281243197806726e-05, |
|
"loss": 1.8547, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.8070685863494873, |
|
"learning_rate": 4.924759456701167e-05, |
|
"loss": 1.5721, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.243337392807007, |
|
"learning_rate": 4.9213188240826245e-05, |
|
"loss": 1.4322, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.166132926940918, |
|
"learning_rate": 4.917802529438864e-05, |
|
"loss": 1.6621, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.54414701461792, |
|
"learning_rate": 4.9142106826480114e-05, |
|
"loss": 1.6088, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 9.983458518981934, |
|
"learning_rate": 4.910543395949067e-05, |
|
"loss": 1.6152, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 6.45111608505249, |
|
"learning_rate": 4.9068007839383946e-05, |
|
"loss": 1.6361, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 108.82310485839844, |
|
"learning_rate": 4.9029829635661475e-05, |
|
"loss": 1.7045, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 1.6494970321655273, |
|
"eval_runtime": 124.6904, |
|
"eval_samples_per_second": 8.02, |
|
"eval_steps_per_second": 2.005, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 5.705786228179932, |
|
"learning_rate": 4.899090054132609e-05, |
|
"loss": 1.738, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.800131320953369, |
|
"learning_rate": 4.895122177284465e-05, |
|
"loss": 1.6218, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 10.11057186126709, |
|
"learning_rate": 4.891079457011005e-05, |
|
"loss": 1.5169, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 9.329095840454102, |
|
"learning_rate": 4.8869620196402436e-05, |
|
"loss": 1.7979, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.9115641117095947, |
|
"learning_rate": 4.882769993834978e-05, |
|
"loss": 1.7073, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.80266809463501, |
|
"learning_rate": 4.878503510588765e-05, |
|
"loss": 1.6541, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 9.07653522491455, |
|
"learning_rate": 4.874162703221823e-05, |
|
"loss": 1.6888, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 4.492751598358154, |
|
"learning_rate": 4.8697477073768766e-05, |
|
"loss": 1.6448, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 13.852599143981934, |
|
"learning_rate": 4.8652586610149095e-05, |
|
"loss": 1.6236, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.424524307250977, |
|
"learning_rate": 4.8606957044108556e-05, |
|
"loss": 1.4969, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_loss": 1.6121476888656616, |
|
"eval_runtime": 124.7413, |
|
"eval_samples_per_second": 8.017, |
|
"eval_steps_per_second": 2.004, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.611617088317871, |
|
"learning_rate": 4.856058980149216e-05, |
|
"loss": 1.4571, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 4.210519313812256, |
|
"learning_rate": 4.851348633119606e-05, |
|
"loss": 1.63, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 95.43629455566406, |
|
"learning_rate": 4.84656481051222e-05, |
|
"loss": 1.6034, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 4.3693528175354, |
|
"learning_rate": 4.8417076618132426e-05, |
|
"loss": 1.5791, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.691178321838379, |
|
"learning_rate": 4.836777338800168e-05, |
|
"loss": 1.5327, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 3.547637939453125, |
|
"learning_rate": 4.8317739955370636e-05, |
|
"loss": 1.4278, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 3.426717519760132, |
|
"learning_rate": 4.8266977883697515e-05, |
|
"loss": 1.5317, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 3.004473924636841, |
|
"learning_rate": 4.821548875920927e-05, |
|
"loss": 1.6848, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 3.686044931411743, |
|
"learning_rate": 4.816327419085196e-05, |
|
"loss": 1.6079, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 4.130298137664795, |
|
"learning_rate": 4.811033581024056e-05, |
|
"loss": 1.5998, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 1.5970302820205688, |
|
"eval_runtime": 124.9388, |
|
"eval_samples_per_second": 8.004, |
|
"eval_steps_per_second": 2.001, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 6.1143059730529785, |
|
"learning_rate": 4.805667527160788e-05, |
|
"loss": 1.554, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 31.27813148498535, |
|
"learning_rate": 4.800229425175294e-05, |
|
"loss": 1.5824, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 9.035768508911133, |
|
"learning_rate": 4.7947194449988555e-05, |
|
"loss": 1.547, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 39.38993835449219, |
|
"learning_rate": 4.7891377588088223e-05, |
|
"loss": 1.5795, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 7.738800048828125, |
|
"learning_rate": 4.7834845410232356e-05, |
|
"loss": 1.5761, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.3933961391448975, |
|
"learning_rate": 4.777759968295369e-05, |
|
"loss": 1.6293, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 4.511744022369385, |
|
"learning_rate": 4.771964219508222e-05, |
|
"loss": 1.4761, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 3.566397190093994, |
|
"learning_rate": 4.766097475768919e-05, |
|
"loss": 1.5707, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 9.365654945373535, |
|
"learning_rate": 4.7601599204030544e-05, |
|
"loss": 1.3932, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 3.3254847526550293, |
|
"learning_rate": 4.754151738948962e-05, |
|
"loss": 1.6041, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"eval_loss": 1.5639870166778564, |
|
"eval_runtime": 124.923, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.001, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 3.520264148712158, |
|
"learning_rate": 4.7480731191519224e-05, |
|
"loss": 1.4991, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 5.3987531661987305, |
|
"learning_rate": 4.741924250958289e-05, |
|
"loss": 1.6856, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 12.352794647216797, |
|
"learning_rate": 4.7357053265095575e-05, |
|
"loss": 1.4509, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 9.825531005859375, |
|
"learning_rate": 4.729416540136361e-05, |
|
"loss": 1.6168, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 10.881526947021484, |
|
"learning_rate": 4.723058088352395e-05, |
|
"loss": 1.5783, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 6.232407093048096, |
|
"learning_rate": 4.7166301698482815e-05, |
|
"loss": 1.4556, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 3.3216302394866943, |
|
"learning_rate": 4.710132985485355e-05, |
|
"loss": 1.593, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 5.219264984130859, |
|
"learning_rate": 4.703566738289389e-05, |
|
"loss": 1.5131, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 7.875769138336182, |
|
"learning_rate": 4.696931633444251e-05, |
|
"loss": 1.5667, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 5.77959680557251, |
|
"learning_rate": 4.69022787828549e-05, |
|
"loss": 1.5211, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 1.5731443166732788, |
|
"eval_runtime": 124.8025, |
|
"eval_samples_per_second": 8.013, |
|
"eval_steps_per_second": 2.003, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 4.806954383850098, |
|
"learning_rate": 4.683455682293863e-05, |
|
"loss": 1.6824, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 5.980200290679932, |
|
"learning_rate": 4.676615257088776e-05, |
|
"loss": 1.5989, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 4.3645429611206055, |
|
"learning_rate": 4.6697068164216896e-05, |
|
"loss": 1.6469, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 3.2400012016296387, |
|
"learning_rate": 4.662730576169423e-05, |
|
"loss": 1.568, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 4.331827640533447, |
|
"learning_rate": 4.6556867543274184e-05, |
|
"loss": 1.5236, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 3.3798201084136963, |
|
"learning_rate": 4.6485755710029256e-05, |
|
"loss": 1.5046, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 5.440864086151123, |
|
"learning_rate": 4.6413972484081216e-05, |
|
"loss": 1.5816, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 5.852995872497559, |
|
"learning_rate": 4.6341520108531746e-05, |
|
"loss": 1.4193, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 4.2782206535339355, |
|
"learning_rate": 4.626840084739224e-05, |
|
"loss": 1.5457, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 8.631403923034668, |
|
"learning_rate": 4.619461698551315e-05, |
|
"loss": 1.652, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"eval_loss": 1.5386379957199097, |
|
"eval_runtime": 124.8384, |
|
"eval_samples_per_second": 8.01, |
|
"eval_steps_per_second": 2.003, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 4.581122875213623, |
|
"learning_rate": 4.612017082851253e-05, |
|
"loss": 1.5746, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 3.0373165607452393, |
|
"learning_rate": 4.604506470270403e-05, |
|
"loss": 1.6038, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 3.5066914558410645, |
|
"learning_rate": 4.5969300955024167e-05, |
|
"loss": 1.5725, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 4.402235507965088, |
|
"learning_rate": 4.589288195295901e-05, |
|
"loss": 1.5469, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 4.844370365142822, |
|
"learning_rate": 4.58158100844702e-05, |
|
"loss": 1.5424, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 4.146657943725586, |
|
"learning_rate": 4.573808775792033e-05, |
|
"loss": 1.4878, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 3.210528612136841, |
|
"learning_rate": 4.5659717401997655e-05, |
|
"loss": 1.6077, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 5.2232818603515625, |
|
"learning_rate": 4.5580701465640254e-05, |
|
"loss": 1.4824, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.8741068840026855, |
|
"learning_rate": 4.550104241795946e-05, |
|
"loss": 1.6172, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 8.092519760131836, |
|
"learning_rate": 4.5420742748162734e-05, |
|
"loss": 1.3659, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 1.5198711156845093, |
|
"eval_runtime": 124.8546, |
|
"eval_samples_per_second": 8.009, |
|
"eval_steps_per_second": 2.002, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 5.068336009979248, |
|
"learning_rate": 4.5339804965475875e-05, |
|
"loss": 1.4661, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 13.167552947998047, |
|
"learning_rate": 4.525823159906459e-05, |
|
"loss": 1.411, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 4.712369918823242, |
|
"learning_rate": 4.5176025197955494e-05, |
|
"loss": 1.3309, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 7.261610507965088, |
|
"learning_rate": 4.509318833095642e-05, |
|
"loss": 1.3892, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 3.8006956577301025, |
|
"learning_rate": 4.500972358657618e-05, |
|
"loss": 1.3927, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 3.6301958560943604, |
|
"learning_rate": 4.492563357294369e-05, |
|
"loss": 1.4629, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 4.353027820587158, |
|
"learning_rate": 4.4840920917726426e-05, |
|
"loss": 1.352, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 3.375173807144165, |
|
"learning_rate": 4.475558826804833e-05, |
|
"loss": 1.4096, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 6.289668560028076, |
|
"learning_rate": 4.466963829040712e-05, |
|
"loss": 1.4834, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 4.517002582550049, |
|
"learning_rate": 4.458307367059092e-05, |
|
"loss": 1.4746, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"eval_loss": 1.5145190954208374, |
|
"eval_runtime": 124.8898, |
|
"eval_samples_per_second": 8.007, |
|
"eval_steps_per_second": 2.002, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 3.195769786834717, |
|
"learning_rate": 4.449589711359438e-05, |
|
"loss": 1.4149, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 3.751405715942383, |
|
"learning_rate": 4.440811134353412e-05, |
|
"loss": 1.5501, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 4.148709774017334, |
|
"learning_rate": 4.431971910356363e-05, |
|
"loss": 1.5253, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 20.003253936767578, |
|
"learning_rate": 4.42307231557875e-05, |
|
"loss": 1.6413, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 4.721023082733154, |
|
"learning_rate": 4.414112628117517e-05, |
|
"loss": 1.5608, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 4.672358989715576, |
|
"learning_rate": 4.4050931279474015e-05, |
|
"loss": 1.3646, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 4.073034286499023, |
|
"learning_rate": 4.396014096912182e-05, |
|
"loss": 1.3499, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 3.2312991619110107, |
|
"learning_rate": 4.386875818715874e-05, |
|
"loss": 1.4648, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 18.92267417907715, |
|
"learning_rate": 4.3776785789138675e-05, |
|
"loss": 1.4919, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 5.677367687225342, |
|
"learning_rate": 4.368422664903997e-05, |
|
"loss": 1.2891, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"eval_loss": 1.504623532295227, |
|
"eval_runtime": 124.8541, |
|
"eval_samples_per_second": 8.009, |
|
"eval_steps_per_second": 2.002, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 5.031940460205078, |
|
"learning_rate": 4.359108365917565e-05, |
|
"loss": 1.4939, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 7.701929092407227, |
|
"learning_rate": 4.349735973010305e-05, |
|
"loss": 1.28, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 5.7498040199279785, |
|
"learning_rate": 4.3403057790532855e-05, |
|
"loss": 1.4584, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 8.7277193069458, |
|
"learning_rate": 4.330818078723755e-05, |
|
"loss": 1.5871, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 13.915125846862793, |
|
"learning_rate": 4.32127316849594e-05, |
|
"loss": 1.3794, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.949733018875122, |
|
"learning_rate": 4.311671346631774e-05, |
|
"loss": 1.3543, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 5.377658843994141, |
|
"learning_rate": 4.302012913171584e-05, |
|
"loss": 1.3695, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 16.94107437133789, |
|
"learning_rate": 4.292298169924709e-05, |
|
"loss": 1.5168, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 4.190367221832275, |
|
"learning_rate": 4.282527420460072e-05, |
|
"loss": 1.4058, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 9.269573211669922, |
|
"learning_rate": 4.272700970096696e-05, |
|
"loss": 1.5794, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"eval_loss": 1.498180627822876, |
|
"eval_runtime": 124.7222, |
|
"eval_samples_per_second": 8.018, |
|
"eval_steps_per_second": 2.004, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 3.951293468475342, |
|
"learning_rate": 4.262819125894156e-05, |
|
"loss": 1.56, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 3.8725697994232178, |
|
"learning_rate": 4.252882196642992e-05, |
|
"loss": 1.5159, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 3.898501396179199, |
|
"learning_rate": 4.242890492855056e-05, |
|
"loss": 1.4659, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 5.807662487030029, |
|
"learning_rate": 4.23284432675381e-05, |
|
"loss": 1.5736, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 3.529371500015259, |
|
"learning_rate": 4.222744012264566e-05, |
|
"loss": 1.5011, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 6.336548805236816, |
|
"learning_rate": 4.212589865004684e-05, |
|
"loss": 1.6629, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 6.222330093383789, |
|
"learning_rate": 4.2023822022737016e-05, |
|
"loss": 1.5573, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 4.25172233581543, |
|
"learning_rate": 4.192121343043424e-05, |
|
"loss": 1.3817, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 4.487111568450928, |
|
"learning_rate": 4.181807607947954e-05, |
|
"loss": 1.5323, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 4.656155109405518, |
|
"learning_rate": 4.1714413192736754e-05, |
|
"loss": 1.3678, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"eval_loss": 1.5049968957901, |
|
"eval_runtime": 124.7803, |
|
"eval_samples_per_second": 8.014, |
|
"eval_steps_per_second": 2.004, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 4.431355953216553, |
|
"learning_rate": 4.161022800949177e-05, |
|
"loss": 1.486, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 18.211524963378906, |
|
"learning_rate": 4.150552378535137e-05, |
|
"loss": 1.4498, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 5.3755292892456055, |
|
"learning_rate": 4.140030379214147e-05, |
|
"loss": 1.4421, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 6.626212120056152, |
|
"learning_rate": 4.1294571317804854e-05, |
|
"loss": 1.4322, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 4.030793190002441, |
|
"learning_rate": 4.1188329666298464e-05, |
|
"loss": 1.3433, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 6.53309440612793, |
|
"learning_rate": 4.108158215749014e-05, |
|
"loss": 1.5604, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.76047420501709, |
|
"learning_rate": 4.0974332127054914e-05, |
|
"loss": 1.3259, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 4.58742094039917, |
|
"learning_rate": 4.0866582926370725e-05, |
|
"loss": 1.4228, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 4.566816806793213, |
|
"learning_rate": 4.0758337922413716e-05, |
|
"loss": 1.3013, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 6.218478202819824, |
|
"learning_rate": 4.064960049765304e-05, |
|
"loss": 1.5061, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"eval_loss": 1.4853577613830566, |
|
"eval_runtime": 124.7889, |
|
"eval_samples_per_second": 8.014, |
|
"eval_steps_per_second": 2.003, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 13.811309814453125, |
|
"learning_rate": 4.054037404994516e-05, |
|
"loss": 1.4839, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 5.560975074768066, |
|
"learning_rate": 4.043066199242762e-05, |
|
"loss": 1.4765, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 35.27302551269531, |
|
"learning_rate": 4.032046775341247e-05, |
|
"loss": 1.4105, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 4.9896745681762695, |
|
"learning_rate": 4.020979477627907e-05, |
|
"loss": 1.5688, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 3.5250892639160156, |
|
"learning_rate": 4.0098646519366534e-05, |
|
"loss": 1.4484, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 5.281729698181152, |
|
"learning_rate": 3.998702645586565e-05, |
|
"loss": 1.6017, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 4.667525768280029, |
|
"learning_rate": 3.9874938073710336e-05, |
|
"loss": 1.5006, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 4.294438362121582, |
|
"learning_rate": 3.976238487546864e-05, |
|
"loss": 1.4218, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 4.070734977722168, |
|
"learning_rate": 3.9649370378233365e-05, |
|
"loss": 1.6569, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 4.640359878540039, |
|
"learning_rate": 3.953589811351204e-05, |
|
"loss": 1.5635, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"eval_loss": 1.478974461555481, |
|
"eval_runtime": 124.6852, |
|
"eval_samples_per_second": 8.02, |
|
"eval_steps_per_second": 2.005, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 4.43009090423584, |
|
"learning_rate": 3.94219716271167e-05, |
|
"loss": 1.3304, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 4.001712799072266, |
|
"learning_rate": 3.930759447905298e-05, |
|
"loss": 1.3534, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 4.664085388183594, |
|
"learning_rate": 3.919277024340891e-05, |
|
"loss": 1.368, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 4.42681360244751, |
|
"learning_rate": 3.907750250824327e-05, |
|
"loss": 1.4164, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 7.331808567047119, |
|
"learning_rate": 3.8961794875473394e-05, |
|
"loss": 1.4333, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 5.612239837646484, |
|
"learning_rate": 3.884565096076269e-05, |
|
"loss": 1.5754, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 5.236481666564941, |
|
"learning_rate": 3.872907439340758e-05, |
|
"loss": 1.4017, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 4.995403289794922, |
|
"learning_rate": 3.861206881622419e-05, |
|
"loss": 1.5011, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 41.0167236328125, |
|
"learning_rate": 3.8494637885434396e-05, |
|
"loss": 1.4472, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 5.136650562286377, |
|
"learning_rate": 3.837678527055168e-05, |
|
"loss": 1.3939, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 1.4779504537582397, |
|
"eval_runtime": 124.6728, |
|
"eval_samples_per_second": 8.021, |
|
"eval_steps_per_second": 2.005, |
|
"step": 1800 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 1.4031463107723264e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|