|
{ |
|
"best_metric": 1.5639870166778564, |
|
"best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-800", |
|
"epoch": 1.6, |
|
"eval_steps": 100, |
|
"global_step": 900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.6079277992248535, |
|
"learning_rate": 4.999960939662063e-05, |
|
"loss": 3.747, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.2283411026000977, |
|
"learning_rate": 4.999843759868819e-05, |
|
"loss": 3.5789, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 41.573001861572266, |
|
"learning_rate": 4.999648464281934e-05, |
|
"loss": 3.1683, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.080965518951416, |
|
"learning_rate": 4.9993750590040575e-05, |
|
"loss": 2.8275, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.576275825500488, |
|
"learning_rate": 4.999023552578632e-05, |
|
"loss": 2.6758, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 18.012842178344727, |
|
"learning_rate": 4.998593955989626e-05, |
|
"loss": 2.6287, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.738934516906738, |
|
"learning_rate": 4.9980862826611875e-05, |
|
"loss": 2.5284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.353776216506958, |
|
"learning_rate": 4.9975005484572305e-05, |
|
"loss": 2.2608, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.6298699378967285, |
|
"learning_rate": 4.9968367716809374e-05, |
|
"loss": 2.2475, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 50.594207763671875, |
|
"learning_rate": 4.996094973074183e-05, |
|
"loss": 2.2007, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 2.126384735107422, |
|
"eval_runtime": 124.9221, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.001, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.225520133972168, |
|
"learning_rate": 4.995275175816891e-05, |
|
"loss": 1.9414, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.777626991271973, |
|
"learning_rate": 4.994377405526308e-05, |
|
"loss": 1.9729, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.133576393127441, |
|
"learning_rate": 4.993401690256203e-05, |
|
"loss": 2.0237, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.396271228790283, |
|
"learning_rate": 4.992348060495989e-05, |
|
"loss": 2.009, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.4974453449249268, |
|
"learning_rate": 4.991216549169776e-05, |
|
"loss": 2.032, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 12.256199836730957, |
|
"learning_rate": 4.990007191635334e-05, |
|
"loss": 1.9548, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.5634379386901855, |
|
"learning_rate": 4.988720025682995e-05, |
|
"loss": 1.8164, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 14.023727416992188, |
|
"learning_rate": 4.987355091534468e-05, |
|
"loss": 1.8517, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.622091293334961, |
|
"learning_rate": 4.985912431841584e-05, |
|
"loss": 2.0255, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.9935083389282227, |
|
"learning_rate": 4.9843920916849645e-05, |
|
"loss": 1.8777, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.8619400262832642, |
|
"eval_runtime": 124.8712, |
|
"eval_samples_per_second": 8.008, |
|
"eval_steps_per_second": 2.002, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.256485939025879, |
|
"learning_rate": 4.982794118572609e-05, |
|
"loss": 1.8885, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 13.212824821472168, |
|
"learning_rate": 4.981118562438414e-05, |
|
"loss": 1.7744, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 4.2626118659973145, |
|
"learning_rate": 4.9793654756406085e-05, |
|
"loss": 1.7545, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 4.217405796051025, |
|
"learning_rate": 4.9775349129601243e-05, |
|
"loss": 1.5633, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 22.393404006958008, |
|
"learning_rate": 4.9756269315988804e-05, |
|
"loss": 1.8871, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.6576473712921143, |
|
"learning_rate": 4.973641591177991e-05, |
|
"loss": 1.7037, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.2433271408081055, |
|
"learning_rate": 4.971578953735912e-05, |
|
"loss": 1.7631, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.7399721145629883, |
|
"learning_rate": 4.969439083726496e-05, |
|
"loss": 1.7714, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.575680255889893, |
|
"learning_rate": 4.967222048016979e-05, |
|
"loss": 1.8699, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 7.729683876037598, |
|
"learning_rate": 4.964927915885893e-05, |
|
"loss": 1.6566, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.7350378036499023, |
|
"eval_runtime": 124.9278, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.001, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.755899667739868, |
|
"learning_rate": 4.962556759020898e-05, |
|
"loss": 1.7193, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.513024091720581, |
|
"learning_rate": 4.960108651516545e-05, |
|
"loss": 1.852, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.7794790267944336, |
|
"learning_rate": 4.9575836698719605e-05, |
|
"loss": 1.6785, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.2256739139556885, |
|
"learning_rate": 4.954981892988451e-05, |
|
"loss": 1.6648, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.8756954669952393, |
|
"learning_rate": 4.952303402167047e-05, |
|
"loss": 1.6399, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 7.057961463928223, |
|
"learning_rate": 4.949548281105951e-05, |
|
"loss": 1.5875, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.63081169128418, |
|
"learning_rate": 4.946716615897932e-05, |
|
"loss": 1.6708, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 8.755204200744629, |
|
"learning_rate": 4.943808495027631e-05, |
|
"loss": 1.636, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 10.21866226196289, |
|
"learning_rate": 4.940824009368793e-05, |
|
"loss": 1.5714, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.44133186340332, |
|
"learning_rate": 4.937763252181434e-05, |
|
"loss": 1.4084, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.6840696334838867, |
|
"eval_runtime": 124.8851, |
|
"eval_samples_per_second": 8.007, |
|
"eval_steps_per_second": 2.002, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.056345224380493, |
|
"learning_rate": 4.934626319108923e-05, |
|
"loss": 1.7233, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.303133487701416, |
|
"learning_rate": 4.93141330817499e-05, |
|
"loss": 1.5374, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 5.2246623039245605, |
|
"learning_rate": 4.9281243197806726e-05, |
|
"loss": 1.8547, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.8070685863494873, |
|
"learning_rate": 4.924759456701167e-05, |
|
"loss": 1.5721, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.243337392807007, |
|
"learning_rate": 4.9213188240826245e-05, |
|
"loss": 1.4322, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.166132926940918, |
|
"learning_rate": 4.917802529438864e-05, |
|
"loss": 1.6621, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.54414701461792, |
|
"learning_rate": 4.9142106826480114e-05, |
|
"loss": 1.6088, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 9.983458518981934, |
|
"learning_rate": 4.910543395949067e-05, |
|
"loss": 1.6152, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 6.45111608505249, |
|
"learning_rate": 4.9068007839383946e-05, |
|
"loss": 1.6361, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 108.82310485839844, |
|
"learning_rate": 4.9029829635661475e-05, |
|
"loss": 1.7045, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 1.6494970321655273, |
|
"eval_runtime": 124.6904, |
|
"eval_samples_per_second": 8.02, |
|
"eval_steps_per_second": 2.005, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 5.705786228179932, |
|
"learning_rate": 4.899090054132609e-05, |
|
"loss": 1.738, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.800131320953369, |
|
"learning_rate": 4.895122177284465e-05, |
|
"loss": 1.6218, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 10.11057186126709, |
|
"learning_rate": 4.891079457011005e-05, |
|
"loss": 1.5169, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 9.329095840454102, |
|
"learning_rate": 4.8869620196402436e-05, |
|
"loss": 1.7979, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.9115641117095947, |
|
"learning_rate": 4.882769993834978e-05, |
|
"loss": 1.7073, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.80266809463501, |
|
"learning_rate": 4.878503510588765e-05, |
|
"loss": 1.6541, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 9.07653522491455, |
|
"learning_rate": 4.874162703221823e-05, |
|
"loss": 1.6888, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 4.492751598358154, |
|
"learning_rate": 4.8697477073768766e-05, |
|
"loss": 1.6448, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 13.852599143981934, |
|
"learning_rate": 4.8652586610149095e-05, |
|
"loss": 1.6236, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.424524307250977, |
|
"learning_rate": 4.8606957044108556e-05, |
|
"loss": 1.4969, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_loss": 1.6121476888656616, |
|
"eval_runtime": 124.7413, |
|
"eval_samples_per_second": 8.017, |
|
"eval_steps_per_second": 2.004, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.611617088317871, |
|
"learning_rate": 4.856058980149216e-05, |
|
"loss": 1.4571, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 4.210519313812256, |
|
"learning_rate": 4.851348633119606e-05, |
|
"loss": 1.63, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 95.43629455566406, |
|
"learning_rate": 4.84656481051222e-05, |
|
"loss": 1.6034, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 4.3693528175354, |
|
"learning_rate": 4.8417076618132426e-05, |
|
"loss": 1.5791, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.691178321838379, |
|
"learning_rate": 4.836777338800168e-05, |
|
"loss": 1.5327, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 3.547637939453125, |
|
"learning_rate": 4.8317739955370636e-05, |
|
"loss": 1.4278, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 3.426717519760132, |
|
"learning_rate": 4.8266977883697515e-05, |
|
"loss": 1.5317, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 3.004473924636841, |
|
"learning_rate": 4.821548875920927e-05, |
|
"loss": 1.6848, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 3.686044931411743, |
|
"learning_rate": 4.816327419085196e-05, |
|
"loss": 1.6079, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 4.130298137664795, |
|
"learning_rate": 4.811033581024056e-05, |
|
"loss": 1.5998, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 1.5970302820205688, |
|
"eval_runtime": 124.9388, |
|
"eval_samples_per_second": 8.004, |
|
"eval_steps_per_second": 2.001, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 6.1143059730529785, |
|
"learning_rate": 4.805667527160788e-05, |
|
"loss": 1.554, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 31.27813148498535, |
|
"learning_rate": 4.800229425175294e-05, |
|
"loss": 1.5824, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 9.035768508911133, |
|
"learning_rate": 4.7947194449988555e-05, |
|
"loss": 1.547, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 39.38993835449219, |
|
"learning_rate": 4.7891377588088223e-05, |
|
"loss": 1.5795, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 7.738800048828125, |
|
"learning_rate": 4.7834845410232356e-05, |
|
"loss": 1.5761, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.3933961391448975, |
|
"learning_rate": 4.777759968295369e-05, |
|
"loss": 1.6293, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 4.511744022369385, |
|
"learning_rate": 4.771964219508222e-05, |
|
"loss": 1.4761, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 3.566397190093994, |
|
"learning_rate": 4.766097475768919e-05, |
|
"loss": 1.5707, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 9.365654945373535, |
|
"learning_rate": 4.7601599204030544e-05, |
|
"loss": 1.3932, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 3.3254847526550293, |
|
"learning_rate": 4.754151738948962e-05, |
|
"loss": 1.6041, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"eval_loss": 1.5639870166778564, |
|
"eval_runtime": 124.923, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.001, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 3.520264148712158, |
|
"learning_rate": 4.7480731191519224e-05, |
|
"loss": 1.4991, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 5.3987531661987305, |
|
"learning_rate": 4.741924250958289e-05, |
|
"loss": 1.6856, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 12.352794647216797, |
|
"learning_rate": 4.7357053265095575e-05, |
|
"loss": 1.4509, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 9.825531005859375, |
|
"learning_rate": 4.729416540136361e-05, |
|
"loss": 1.6168, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 10.881526947021484, |
|
"learning_rate": 4.723058088352395e-05, |
|
"loss": 1.5783, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 6.232407093048096, |
|
"learning_rate": 4.7166301698482815e-05, |
|
"loss": 1.4556, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 3.3216302394866943, |
|
"learning_rate": 4.710132985485355e-05, |
|
"loss": 1.593, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 5.219264984130859, |
|
"learning_rate": 4.703566738289389e-05, |
|
"loss": 1.5131, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 7.875769138336182, |
|
"learning_rate": 4.696931633444251e-05, |
|
"loss": 1.5667, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 5.77959680557251, |
|
"learning_rate": 4.69022787828549e-05, |
|
"loss": 1.5211, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 1.5731443166732788, |
|
"eval_runtime": 124.8025, |
|
"eval_samples_per_second": 8.013, |
|
"eval_steps_per_second": 2.003, |
|
"step": 900 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 7.080392918182134e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|