{ "best_metric": 1.4545879364013672, "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-2100", "epoch": 3.7333333333333334, "eval_steps": 100, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 6.6079277992248535, "learning_rate": 4.999960939662063e-05, "loss": 3.747, "step": 10 }, { "epoch": 0.04, "grad_norm": 3.2283411026000977, "learning_rate": 4.999843759868819e-05, "loss": 3.5789, "step": 20 }, { "epoch": 0.05, "grad_norm": 41.573001861572266, "learning_rate": 4.999648464281934e-05, "loss": 3.1683, "step": 30 }, { "epoch": 0.07, "grad_norm": 4.080965518951416, "learning_rate": 4.9993750590040575e-05, "loss": 2.8275, "step": 40 }, { "epoch": 0.09, "grad_norm": 4.576275825500488, "learning_rate": 4.999023552578632e-05, "loss": 2.6758, "step": 50 }, { "epoch": 0.11, "grad_norm": 18.012842178344727, "learning_rate": 4.998593955989626e-05, "loss": 2.6287, "step": 60 }, { "epoch": 0.12, "grad_norm": 5.738934516906738, "learning_rate": 4.9980862826611875e-05, "loss": 2.5284, "step": 70 }, { "epoch": 0.14, "grad_norm": 3.353776216506958, "learning_rate": 4.9975005484572305e-05, "loss": 2.2608, "step": 80 }, { "epoch": 0.16, "grad_norm": 4.6298699378967285, "learning_rate": 4.9968367716809374e-05, "loss": 2.2475, "step": 90 }, { "epoch": 0.18, "grad_norm": 50.594207763671875, "learning_rate": 4.996094973074183e-05, "loss": 2.2007, "step": 100 }, { "epoch": 0.18, "eval_loss": 2.126384735107422, "eval_runtime": 124.9221, "eval_samples_per_second": 8.005, "eval_steps_per_second": 2.001, "step": 100 }, { "epoch": 0.2, "grad_norm": 10.225520133972168, "learning_rate": 4.995275175816891e-05, "loss": 1.9414, "step": 110 }, { "epoch": 0.21, "grad_norm": 4.777626991271973, "learning_rate": 4.994377405526308e-05, "loss": 1.9729, "step": 120 }, { "epoch": 0.23, "grad_norm": 6.133576393127441, "learning_rate": 4.993401690256203e-05, "loss": 2.0237, "step": 130 }, { "epoch": 0.25, "grad_norm": 5.396271228790283, "learning_rate": 4.992348060495989e-05, "loss": 2.009, "step": 140 }, { "epoch": 0.27, "grad_norm": 3.4974453449249268, "learning_rate": 4.991216549169776e-05, "loss": 2.032, "step": 150 }, { "epoch": 0.28, "grad_norm": 12.256199836730957, "learning_rate": 4.990007191635334e-05, "loss": 1.9548, "step": 160 }, { "epoch": 0.3, "grad_norm": 7.5634379386901855, "learning_rate": 4.988720025682995e-05, "loss": 1.8164, "step": 170 }, { "epoch": 0.32, "grad_norm": 14.023727416992188, "learning_rate": 4.987355091534468e-05, "loss": 1.8517, "step": 180 }, { "epoch": 0.34, "grad_norm": 4.622091293334961, "learning_rate": 4.985912431841584e-05, "loss": 2.0255, "step": 190 }, { "epoch": 0.36, "grad_norm": 3.9935083389282227, "learning_rate": 4.9843920916849645e-05, "loss": 1.8777, "step": 200 }, { "epoch": 0.36, "eval_loss": 1.8619400262832642, "eval_runtime": 124.8712, "eval_samples_per_second": 8.008, "eval_steps_per_second": 2.002, "step": 200 }, { "epoch": 0.37, "grad_norm": 6.256485939025879, "learning_rate": 4.982794118572609e-05, "loss": 1.8885, "step": 210 }, { "epoch": 0.39, "grad_norm": 13.212824821472168, "learning_rate": 4.981118562438414e-05, "loss": 1.7744, "step": 220 }, { "epoch": 0.41, "grad_norm": 4.2626118659973145, "learning_rate": 4.9793654756406085e-05, "loss": 1.7545, "step": 230 }, { "epoch": 0.43, "grad_norm": 4.217405796051025, "learning_rate": 4.9775349129601243e-05, "loss": 1.5633, "step": 240 }, { "epoch": 0.44, "grad_norm": 22.393404006958008, "learning_rate": 4.9756269315988804e-05, "loss": 1.8871, "step": 250 }, { "epoch": 0.46, "grad_norm": 3.6576473712921143, "learning_rate": 4.973641591177991e-05, "loss": 1.7037, "step": 260 }, { "epoch": 0.48, "grad_norm": 4.2433271408081055, "learning_rate": 4.971578953735912e-05, "loss": 1.7631, "step": 270 }, { "epoch": 0.5, "grad_norm": 3.7399721145629883, "learning_rate": 4.969439083726496e-05, "loss": 1.7714, "step": 280 }, { "epoch": 0.52, "grad_norm": 4.575680255889893, "learning_rate": 4.967222048016979e-05, "loss": 1.8699, "step": 290 }, { "epoch": 0.53, "grad_norm": 7.729683876037598, "learning_rate": 4.964927915885893e-05, "loss": 1.6566, "step": 300 }, { "epoch": 0.53, "eval_loss": 1.7350378036499023, "eval_runtime": 124.9278, "eval_samples_per_second": 8.005, "eval_steps_per_second": 2.001, "step": 300 }, { "epoch": 0.55, "grad_norm": 2.755899667739868, "learning_rate": 4.962556759020898e-05, "loss": 1.7193, "step": 310 }, { "epoch": 0.57, "grad_norm": 3.513024091720581, "learning_rate": 4.960108651516545e-05, "loss": 1.852, "step": 320 }, { "epoch": 0.59, "grad_norm": 3.7794790267944336, "learning_rate": 4.9575836698719605e-05, "loss": 1.6785, "step": 330 }, { "epoch": 0.6, "grad_norm": 3.2256739139556885, "learning_rate": 4.954981892988451e-05, "loss": 1.6648, "step": 340 }, { "epoch": 0.62, "grad_norm": 2.8756954669952393, "learning_rate": 4.952303402167047e-05, "loss": 1.6399, "step": 350 }, { "epoch": 0.64, "grad_norm": 7.057961463928223, "learning_rate": 4.949548281105951e-05, "loss": 1.5875, "step": 360 }, { "epoch": 0.66, "grad_norm": 4.63081169128418, "learning_rate": 4.946716615897932e-05, "loss": 1.6708, "step": 370 }, { "epoch": 0.68, "grad_norm": 8.755204200744629, "learning_rate": 4.943808495027631e-05, "loss": 1.636, "step": 380 }, { "epoch": 0.69, "grad_norm": 10.21866226196289, "learning_rate": 4.940824009368793e-05, "loss": 1.5714, "step": 390 }, { "epoch": 0.71, "grad_norm": 5.44133186340332, "learning_rate": 4.937763252181434e-05, "loss": 1.4084, "step": 400 }, { "epoch": 0.71, "eval_loss": 1.6840696334838867, "eval_runtime": 124.8851, "eval_samples_per_second": 8.007, "eval_steps_per_second": 2.002, "step": 400 }, { "epoch": 0.73, "grad_norm": 3.056345224380493, "learning_rate": 4.934626319108923e-05, "loss": 1.7233, "step": 410 }, { "epoch": 0.75, "grad_norm": 4.303133487701416, "learning_rate": 4.93141330817499e-05, "loss": 1.5374, "step": 420 }, { "epoch": 0.76, "grad_norm": 5.2246623039245605, "learning_rate": 4.9281243197806726e-05, "loss": 1.8547, "step": 430 }, { "epoch": 0.78, "grad_norm": 3.8070685863494873, "learning_rate": 4.924759456701167e-05, "loss": 1.5721, "step": 440 }, { "epoch": 0.8, "grad_norm": 3.243337392807007, "learning_rate": 4.9213188240826245e-05, "loss": 1.4322, "step": 450 }, { "epoch": 0.82, "grad_norm": 4.166132926940918, "learning_rate": 4.917802529438864e-05, "loss": 1.6621, "step": 460 }, { "epoch": 0.84, "grad_norm": 4.54414701461792, "learning_rate": 4.9142106826480114e-05, "loss": 1.6088, "step": 470 }, { "epoch": 0.85, "grad_norm": 9.983458518981934, "learning_rate": 4.910543395949067e-05, "loss": 1.6152, "step": 480 }, { "epoch": 0.87, "grad_norm": 6.45111608505249, "learning_rate": 4.9068007839383946e-05, "loss": 1.6361, "step": 490 }, { "epoch": 0.89, "grad_norm": 108.82310485839844, "learning_rate": 4.9029829635661475e-05, "loss": 1.7045, "step": 500 }, { "epoch": 0.89, "eval_loss": 1.6494970321655273, "eval_runtime": 124.6904, "eval_samples_per_second": 8.02, "eval_steps_per_second": 2.005, "step": 500 }, { "epoch": 0.91, "grad_norm": 5.705786228179932, "learning_rate": 4.899090054132609e-05, "loss": 1.738, "step": 510 }, { "epoch": 0.92, "grad_norm": 4.800131320953369, "learning_rate": 4.895122177284465e-05, "loss": 1.6218, "step": 520 }, { "epoch": 0.94, "grad_norm": 10.11057186126709, "learning_rate": 4.891079457011005e-05, "loss": 1.5169, "step": 530 }, { "epoch": 0.96, "grad_norm": 9.329095840454102, "learning_rate": 4.8869620196402436e-05, "loss": 1.7979, "step": 540 }, { "epoch": 0.98, "grad_norm": 3.9115641117095947, "learning_rate": 4.882769993834978e-05, "loss": 1.7073, "step": 550 }, { "epoch": 1.0, "grad_norm": 4.80266809463501, "learning_rate": 4.878503510588765e-05, "loss": 1.6541, "step": 560 }, { "epoch": 1.01, "grad_norm": 9.07653522491455, "learning_rate": 4.874162703221823e-05, "loss": 1.6888, "step": 570 }, { "epoch": 1.03, "grad_norm": 4.492751598358154, "learning_rate": 4.8697477073768766e-05, "loss": 1.6448, "step": 580 }, { "epoch": 1.05, "grad_norm": 13.852599143981934, "learning_rate": 4.8652586610149095e-05, "loss": 1.6236, "step": 590 }, { "epoch": 1.07, "grad_norm": 5.424524307250977, "learning_rate": 4.8606957044108556e-05, "loss": 1.4969, "step": 600 }, { "epoch": 1.07, "eval_loss": 1.6121476888656616, "eval_runtime": 124.7413, "eval_samples_per_second": 8.017, "eval_steps_per_second": 2.004, "step": 600 }, { "epoch": 1.08, "grad_norm": 3.611617088317871, "learning_rate": 4.856058980149216e-05, "loss": 1.4571, "step": 610 }, { "epoch": 1.1, "grad_norm": 4.210519313812256, "learning_rate": 4.851348633119606e-05, "loss": 1.63, "step": 620 }, { "epoch": 1.12, "grad_norm": 95.43629455566406, "learning_rate": 4.84656481051222e-05, "loss": 1.6034, "step": 630 }, { "epoch": 1.14, "grad_norm": 4.3693528175354, "learning_rate": 4.8417076618132426e-05, "loss": 1.5791, "step": 640 }, { "epoch": 1.16, "grad_norm": 3.691178321838379, "learning_rate": 4.836777338800168e-05, "loss": 1.5327, "step": 650 }, { "epoch": 1.17, "grad_norm": 3.547637939453125, "learning_rate": 4.8317739955370636e-05, "loss": 1.4278, "step": 660 }, { "epoch": 1.19, "grad_norm": 3.426717519760132, "learning_rate": 4.8266977883697515e-05, "loss": 1.5317, "step": 670 }, { "epoch": 1.21, "grad_norm": 3.004473924636841, "learning_rate": 4.821548875920927e-05, "loss": 1.6848, "step": 680 }, { "epoch": 1.23, "grad_norm": 3.686044931411743, "learning_rate": 4.816327419085196e-05, "loss": 1.6079, "step": 690 }, { "epoch": 1.24, "grad_norm": 4.130298137664795, "learning_rate": 4.811033581024056e-05, "loss": 1.5998, "step": 700 }, { "epoch": 1.24, "eval_loss": 1.5970302820205688, "eval_runtime": 124.9388, "eval_samples_per_second": 8.004, "eval_steps_per_second": 2.001, "step": 700 }, { "epoch": 1.26, "grad_norm": 6.1143059730529785, "learning_rate": 4.805667527160788e-05, "loss": 1.554, "step": 710 }, { "epoch": 1.28, "grad_norm": 31.27813148498535, "learning_rate": 4.800229425175294e-05, "loss": 1.5824, "step": 720 }, { "epoch": 1.3, "grad_norm": 9.035768508911133, "learning_rate": 4.7947194449988555e-05, "loss": 1.547, "step": 730 }, { "epoch": 1.32, "grad_norm": 39.38993835449219, "learning_rate": 4.7891377588088223e-05, "loss": 1.5795, "step": 740 }, { "epoch": 1.33, "grad_norm": 7.738800048828125, "learning_rate": 4.7834845410232356e-05, "loss": 1.5761, "step": 750 }, { "epoch": 1.35, "grad_norm": 3.3933961391448975, "learning_rate": 4.777759968295369e-05, "loss": 1.6293, "step": 760 }, { "epoch": 1.37, "grad_norm": 4.511744022369385, "learning_rate": 4.771964219508222e-05, "loss": 1.4761, "step": 770 }, { "epoch": 1.39, "grad_norm": 3.566397190093994, "learning_rate": 4.766097475768919e-05, "loss": 1.5707, "step": 780 }, { "epoch": 1.4, "grad_norm": 9.365654945373535, "learning_rate": 4.7601599204030544e-05, "loss": 1.3932, "step": 790 }, { "epoch": 1.42, "grad_norm": 3.3254847526550293, "learning_rate": 4.754151738948962e-05, "loss": 1.6041, "step": 800 }, { "epoch": 1.42, "eval_loss": 1.5639870166778564, "eval_runtime": 124.923, "eval_samples_per_second": 8.005, "eval_steps_per_second": 2.001, "step": 800 }, { "epoch": 1.44, "grad_norm": 3.520264148712158, "learning_rate": 4.7480731191519224e-05, "loss": 1.4991, "step": 810 }, { "epoch": 1.46, "grad_norm": 5.3987531661987305, "learning_rate": 4.741924250958289e-05, "loss": 1.6856, "step": 820 }, { "epoch": 1.48, "grad_norm": 12.352794647216797, "learning_rate": 4.7357053265095575e-05, "loss": 1.4509, "step": 830 }, { "epoch": 1.49, "grad_norm": 9.825531005859375, "learning_rate": 4.729416540136361e-05, "loss": 1.6168, "step": 840 }, { "epoch": 1.51, "grad_norm": 10.881526947021484, "learning_rate": 4.723058088352395e-05, "loss": 1.5783, "step": 850 }, { "epoch": 1.53, "grad_norm": 6.232407093048096, "learning_rate": 4.7166301698482815e-05, "loss": 1.4556, "step": 860 }, { "epoch": 1.55, "grad_norm": 3.3216302394866943, "learning_rate": 4.710132985485355e-05, "loss": 1.593, "step": 870 }, { "epoch": 1.56, "grad_norm": 5.219264984130859, "learning_rate": 4.703566738289389e-05, "loss": 1.5131, "step": 880 }, { "epoch": 1.58, "grad_norm": 7.875769138336182, "learning_rate": 4.696931633444251e-05, "loss": 1.5667, "step": 890 }, { "epoch": 1.6, "grad_norm": 5.77959680557251, "learning_rate": 4.69022787828549e-05, "loss": 1.5211, "step": 900 }, { "epoch": 1.6, "eval_loss": 1.5731443166732788, "eval_runtime": 124.8025, "eval_samples_per_second": 8.013, "eval_steps_per_second": 2.003, "step": 900 }, { "epoch": 1.62, "grad_norm": 4.806954383850098, "learning_rate": 4.683455682293863e-05, "loss": 1.6824, "step": 910 }, { "epoch": 1.64, "grad_norm": 5.980200290679932, "learning_rate": 4.676615257088776e-05, "loss": 1.5989, "step": 920 }, { "epoch": 1.65, "grad_norm": 4.3645429611206055, "learning_rate": 4.6697068164216896e-05, "loss": 1.6469, "step": 930 }, { "epoch": 1.67, "grad_norm": 3.2400012016296387, "learning_rate": 4.662730576169423e-05, "loss": 1.568, "step": 940 }, { "epoch": 1.69, "grad_norm": 4.331827640533447, "learning_rate": 4.6556867543274184e-05, "loss": 1.5236, "step": 950 }, { "epoch": 1.71, "grad_norm": 3.3798201084136963, "learning_rate": 4.6485755710029256e-05, "loss": 1.5046, "step": 960 }, { "epoch": 1.72, "grad_norm": 5.440864086151123, "learning_rate": 4.6413972484081216e-05, "loss": 1.5816, "step": 970 }, { "epoch": 1.74, "grad_norm": 5.852995872497559, "learning_rate": 4.6341520108531746e-05, "loss": 1.4193, "step": 980 }, { "epoch": 1.76, "grad_norm": 4.2782206535339355, "learning_rate": 4.626840084739224e-05, "loss": 1.5457, "step": 990 }, { "epoch": 1.78, "grad_norm": 8.631403923034668, "learning_rate": 4.619461698551315e-05, "loss": 1.652, "step": 1000 }, { "epoch": 1.78, "eval_loss": 1.5386379957199097, "eval_runtime": 124.8384, "eval_samples_per_second": 8.01, "eval_steps_per_second": 2.003, "step": 1000 }, { "epoch": 1.8, "grad_norm": 4.581122875213623, "learning_rate": 4.612017082851253e-05, "loss": 1.5746, "step": 1010 }, { "epoch": 1.81, "grad_norm": 3.0373165607452393, "learning_rate": 4.604506470270403e-05, "loss": 1.6038, "step": 1020 }, { "epoch": 1.83, "grad_norm": 3.5066914558410645, "learning_rate": 4.5969300955024167e-05, "loss": 1.5725, "step": 1030 }, { "epoch": 1.85, "grad_norm": 4.402235507965088, "learning_rate": 4.589288195295901e-05, "loss": 1.5469, "step": 1040 }, { "epoch": 1.87, "grad_norm": 4.844370365142822, "learning_rate": 4.58158100844702e-05, "loss": 1.5424, "step": 1050 }, { "epoch": 1.88, "grad_norm": 4.146657943725586, "learning_rate": 4.573808775792033e-05, "loss": 1.4878, "step": 1060 }, { "epoch": 1.9, "grad_norm": 3.210528612136841, "learning_rate": 4.5659717401997655e-05, "loss": 1.6077, "step": 1070 }, { "epoch": 1.92, "grad_norm": 5.2232818603515625, "learning_rate": 4.5580701465640254e-05, "loss": 1.4824, "step": 1080 }, { "epoch": 1.94, "grad_norm": 2.8741068840026855, "learning_rate": 4.550104241795946e-05, "loss": 1.6172, "step": 1090 }, { "epoch": 1.96, "grad_norm": 8.092519760131836, "learning_rate": 4.5420742748162734e-05, "loss": 1.3659, "step": 1100 }, { "epoch": 1.96, "eval_loss": 1.5198711156845093, "eval_runtime": 124.8546, "eval_samples_per_second": 8.009, "eval_steps_per_second": 2.002, "step": 1100 }, { "epoch": 1.97, "grad_norm": 5.068336009979248, "learning_rate": 4.5339804965475875e-05, "loss": 1.4661, "step": 1110 }, { "epoch": 1.99, "grad_norm": 13.167552947998047, "learning_rate": 4.525823159906459e-05, "loss": 1.411, "step": 1120 }, { "epoch": 2.01, "grad_norm": 4.712369918823242, "learning_rate": 4.5176025197955494e-05, "loss": 1.3309, "step": 1130 }, { "epoch": 2.03, "grad_norm": 7.261610507965088, "learning_rate": 4.509318833095642e-05, "loss": 1.3892, "step": 1140 }, { "epoch": 2.04, "grad_norm": 3.8006956577301025, "learning_rate": 4.500972358657618e-05, "loss": 1.3927, "step": 1150 }, { "epoch": 2.06, "grad_norm": 3.6301958560943604, "learning_rate": 4.492563357294369e-05, "loss": 1.4629, "step": 1160 }, { "epoch": 2.08, "grad_norm": 4.353027820587158, "learning_rate": 4.4840920917726426e-05, "loss": 1.352, "step": 1170 }, { "epoch": 2.1, "grad_norm": 3.375173807144165, "learning_rate": 4.475558826804833e-05, "loss": 1.4096, "step": 1180 }, { "epoch": 2.12, "grad_norm": 6.289668560028076, "learning_rate": 4.466963829040712e-05, "loss": 1.4834, "step": 1190 }, { "epoch": 2.13, "grad_norm": 4.517002582550049, "learning_rate": 4.458307367059092e-05, "loss": 1.4746, "step": 1200 }, { "epoch": 2.13, "eval_loss": 1.5145190954208374, "eval_runtime": 124.8898, "eval_samples_per_second": 8.007, "eval_steps_per_second": 2.002, "step": 1200 }, { "epoch": 2.15, "grad_norm": 3.195769786834717, "learning_rate": 4.449589711359438e-05, "loss": 1.4149, "step": 1210 }, { "epoch": 2.17, "grad_norm": 3.751405715942383, "learning_rate": 4.440811134353412e-05, "loss": 1.5501, "step": 1220 }, { "epoch": 2.19, "grad_norm": 4.148709774017334, "learning_rate": 4.431971910356363e-05, "loss": 1.5253, "step": 1230 }, { "epoch": 2.2, "grad_norm": 20.003253936767578, "learning_rate": 4.42307231557875e-05, "loss": 1.6413, "step": 1240 }, { "epoch": 2.22, "grad_norm": 4.721023082733154, "learning_rate": 4.414112628117517e-05, "loss": 1.5608, "step": 1250 }, { "epoch": 2.24, "grad_norm": 4.672358989715576, "learning_rate": 4.4050931279474015e-05, "loss": 1.3646, "step": 1260 }, { "epoch": 2.26, "grad_norm": 4.073034286499023, "learning_rate": 4.396014096912182e-05, "loss": 1.3499, "step": 1270 }, { "epoch": 2.28, "grad_norm": 3.2312991619110107, "learning_rate": 4.386875818715874e-05, "loss": 1.4648, "step": 1280 }, { "epoch": 2.29, "grad_norm": 18.92267417907715, "learning_rate": 4.3776785789138675e-05, "loss": 1.4919, "step": 1290 }, { "epoch": 2.31, "grad_norm": 5.677367687225342, "learning_rate": 4.368422664903997e-05, "loss": 1.2891, "step": 1300 }, { "epoch": 2.31, "eval_loss": 1.504623532295227, "eval_runtime": 124.8541, "eval_samples_per_second": 8.009, "eval_steps_per_second": 2.002, "step": 1300 }, { "epoch": 2.33, "grad_norm": 5.031940460205078, "learning_rate": 4.359108365917565e-05, "loss": 1.4939, "step": 1310 }, { "epoch": 2.35, "grad_norm": 7.701929092407227, "learning_rate": 4.349735973010305e-05, "loss": 1.28, "step": 1320 }, { "epoch": 2.36, "grad_norm": 5.7498040199279785, "learning_rate": 4.3403057790532855e-05, "loss": 1.4584, "step": 1330 }, { "epoch": 2.38, "grad_norm": 8.7277193069458, "learning_rate": 4.330818078723755e-05, "loss": 1.5871, "step": 1340 }, { "epoch": 2.4, "grad_norm": 13.915125846862793, "learning_rate": 4.32127316849594e-05, "loss": 1.3794, "step": 1350 }, { "epoch": 2.42, "grad_norm": 2.949733018875122, "learning_rate": 4.311671346631774e-05, "loss": 1.3543, "step": 1360 }, { "epoch": 2.44, "grad_norm": 5.377658843994141, "learning_rate": 4.302012913171584e-05, "loss": 1.3695, "step": 1370 }, { "epoch": 2.45, "grad_norm": 16.94107437133789, "learning_rate": 4.292298169924709e-05, "loss": 1.5168, "step": 1380 }, { "epoch": 2.47, "grad_norm": 4.190367221832275, "learning_rate": 4.282527420460072e-05, "loss": 1.4058, "step": 1390 }, { "epoch": 2.49, "grad_norm": 9.269573211669922, "learning_rate": 4.272700970096696e-05, "loss": 1.5794, "step": 1400 }, { "epoch": 2.49, "eval_loss": 1.498180627822876, "eval_runtime": 124.7222, "eval_samples_per_second": 8.018, "eval_steps_per_second": 2.004, "step": 1400 }, { "epoch": 2.51, "grad_norm": 3.951293468475342, "learning_rate": 4.262819125894156e-05, "loss": 1.56, "step": 1410 }, { "epoch": 2.52, "grad_norm": 3.8725697994232178, "learning_rate": 4.252882196642992e-05, "loss": 1.5159, "step": 1420 }, { "epoch": 2.54, "grad_norm": 3.898501396179199, "learning_rate": 4.242890492855056e-05, "loss": 1.4659, "step": 1430 }, { "epoch": 2.56, "grad_norm": 5.807662487030029, "learning_rate": 4.23284432675381e-05, "loss": 1.5736, "step": 1440 }, { "epoch": 2.58, "grad_norm": 3.529371500015259, "learning_rate": 4.222744012264566e-05, "loss": 1.5011, "step": 1450 }, { "epoch": 2.6, "grad_norm": 6.336548805236816, "learning_rate": 4.212589865004684e-05, "loss": 1.6629, "step": 1460 }, { "epoch": 2.61, "grad_norm": 6.222330093383789, "learning_rate": 4.2023822022737016e-05, "loss": 1.5573, "step": 1470 }, { "epoch": 2.63, "grad_norm": 4.25172233581543, "learning_rate": 4.192121343043424e-05, "loss": 1.3817, "step": 1480 }, { "epoch": 2.65, "grad_norm": 4.487111568450928, "learning_rate": 4.181807607947954e-05, "loss": 1.5323, "step": 1490 }, { "epoch": 2.67, "grad_norm": 4.656155109405518, "learning_rate": 4.1714413192736754e-05, "loss": 1.3678, "step": 1500 }, { "epoch": 2.67, "eval_loss": 1.5049968957901, "eval_runtime": 124.7803, "eval_samples_per_second": 8.014, "eval_steps_per_second": 2.004, "step": 1500 }, { "epoch": 2.68, "grad_norm": 4.431355953216553, "learning_rate": 4.161022800949177e-05, "loss": 1.486, "step": 1510 }, { "epoch": 2.7, "grad_norm": 18.211524963378906, "learning_rate": 4.150552378535137e-05, "loss": 1.4498, "step": 1520 }, { "epoch": 2.72, "grad_norm": 5.3755292892456055, "learning_rate": 4.140030379214147e-05, "loss": 1.4421, "step": 1530 }, { "epoch": 2.74, "grad_norm": 6.626212120056152, "learning_rate": 4.1294571317804854e-05, "loss": 1.4322, "step": 1540 }, { "epoch": 2.76, "grad_norm": 4.030793190002441, "learning_rate": 4.1188329666298464e-05, "loss": 1.3433, "step": 1550 }, { "epoch": 2.77, "grad_norm": 6.53309440612793, "learning_rate": 4.108158215749014e-05, "loss": 1.5604, "step": 1560 }, { "epoch": 2.79, "grad_norm": 3.76047420501709, "learning_rate": 4.0974332127054914e-05, "loss": 1.3259, "step": 1570 }, { "epoch": 2.81, "grad_norm": 4.58742094039917, "learning_rate": 4.0866582926370725e-05, "loss": 1.4228, "step": 1580 }, { "epoch": 2.83, "grad_norm": 4.566816806793213, "learning_rate": 4.0758337922413716e-05, "loss": 1.3013, "step": 1590 }, { "epoch": 2.84, "grad_norm": 6.218478202819824, "learning_rate": 4.064960049765304e-05, "loss": 1.5061, "step": 1600 }, { "epoch": 2.84, "eval_loss": 1.4853577613830566, "eval_runtime": 124.7889, "eval_samples_per_second": 8.014, "eval_steps_per_second": 2.003, "step": 1600 }, { "epoch": 2.86, "grad_norm": 13.811309814453125, "learning_rate": 4.054037404994516e-05, "loss": 1.4839, "step": 1610 }, { "epoch": 2.88, "grad_norm": 5.560975074768066, "learning_rate": 4.043066199242762e-05, "loss": 1.4765, "step": 1620 }, { "epoch": 2.9, "grad_norm": 35.27302551269531, "learning_rate": 4.032046775341247e-05, "loss": 1.4105, "step": 1630 }, { "epoch": 2.92, "grad_norm": 4.9896745681762695, "learning_rate": 4.020979477627907e-05, "loss": 1.5688, "step": 1640 }, { "epoch": 2.93, "grad_norm": 3.5250892639160156, "learning_rate": 4.0098646519366534e-05, "loss": 1.4484, "step": 1650 }, { "epoch": 2.95, "grad_norm": 5.281729698181152, "learning_rate": 3.998702645586565e-05, "loss": 1.6017, "step": 1660 }, { "epoch": 2.97, "grad_norm": 4.667525768280029, "learning_rate": 3.9874938073710336e-05, "loss": 1.5006, "step": 1670 }, { "epoch": 2.99, "grad_norm": 4.294438362121582, "learning_rate": 3.976238487546864e-05, "loss": 1.4218, "step": 1680 }, { "epoch": 3.0, "grad_norm": 4.070734977722168, "learning_rate": 3.9649370378233365e-05, "loss": 1.6569, "step": 1690 }, { "epoch": 3.02, "grad_norm": 4.640359878540039, "learning_rate": 3.953589811351204e-05, "loss": 1.5635, "step": 1700 }, { "epoch": 3.02, "eval_loss": 1.478974461555481, "eval_runtime": 124.6852, "eval_samples_per_second": 8.02, "eval_steps_per_second": 2.005, "step": 1700 }, { "epoch": 3.04, "grad_norm": 4.43009090423584, "learning_rate": 3.94219716271167e-05, "loss": 1.3304, "step": 1710 }, { "epoch": 3.06, "grad_norm": 4.001712799072266, "learning_rate": 3.930759447905298e-05, "loss": 1.3534, "step": 1720 }, { "epoch": 3.08, "grad_norm": 4.664085388183594, "learning_rate": 3.919277024340891e-05, "loss": 1.368, "step": 1730 }, { "epoch": 3.09, "grad_norm": 4.42681360244751, "learning_rate": 3.907750250824327e-05, "loss": 1.4164, "step": 1740 }, { "epoch": 3.11, "grad_norm": 7.331808567047119, "learning_rate": 3.8961794875473394e-05, "loss": 1.4333, "step": 1750 }, { "epoch": 3.13, "grad_norm": 5.612239837646484, "learning_rate": 3.884565096076269e-05, "loss": 1.5754, "step": 1760 }, { "epoch": 3.15, "grad_norm": 5.236481666564941, "learning_rate": 3.872907439340758e-05, "loss": 1.4017, "step": 1770 }, { "epoch": 3.16, "grad_norm": 4.995403289794922, "learning_rate": 3.861206881622419e-05, "loss": 1.5011, "step": 1780 }, { "epoch": 3.18, "grad_norm": 41.0167236328125, "learning_rate": 3.8494637885434396e-05, "loss": 1.4472, "step": 1790 }, { "epoch": 3.2, "grad_norm": 5.136650562286377, "learning_rate": 3.837678527055168e-05, "loss": 1.3939, "step": 1800 }, { "epoch": 3.2, "eval_loss": 1.4779504537582397, "eval_runtime": 124.6728, "eval_samples_per_second": 8.021, "eval_steps_per_second": 2.005, "step": 1800 }, { "epoch": 3.22, "grad_norm": 5.178096294403076, "learning_rate": 3.8258514654266434e-05, "loss": 1.5265, "step": 1810 }, { "epoch": 3.24, "grad_norm": 6.949739456176758, "learning_rate": 3.813982973233083e-05, "loss": 1.3674, "step": 1820 }, { "epoch": 3.25, "grad_norm": 3.84801983833313, "learning_rate": 3.802073421344339e-05, "loss": 1.4305, "step": 1830 }, { "epoch": 3.27, "grad_norm": 3.5803613662719727, "learning_rate": 3.7901231819133105e-05, "loss": 1.557, "step": 1840 }, { "epoch": 3.29, "grad_norm": 3.509099245071411, "learning_rate": 3.7781326283643085e-05, "loss": 1.3611, "step": 1850 }, { "epoch": 3.31, "grad_norm": 41.185279846191406, "learning_rate": 3.766102135381393e-05, "loss": 1.3944, "step": 1860 }, { "epoch": 3.32, "grad_norm": 3.797672748565674, "learning_rate": 3.75403207889666e-05, "loss": 1.2557, "step": 1870 }, { "epoch": 3.34, "grad_norm": 4.237602233886719, "learning_rate": 3.741922836078499e-05, "loss": 1.3583, "step": 1880 }, { "epoch": 3.36, "grad_norm": 4.5037312507629395, "learning_rate": 3.729774785319801e-05, "loss": 1.4619, "step": 1890 }, { "epoch": 3.38, "grad_norm": 4.292742729187012, "learning_rate": 3.717588306226143e-05, "loss": 1.3986, "step": 1900 }, { "epoch": 3.38, "eval_loss": 1.4685039520263672, "eval_runtime": 124.7126, "eval_samples_per_second": 8.018, "eval_steps_per_second": 2.005, "step": 1900 }, { "epoch": 3.4, "grad_norm": 5.198277950286865, "learning_rate": 3.705363779603917e-05, "loss": 1.349, "step": 1910 }, { "epoch": 3.41, "grad_norm": 3.86722469329834, "learning_rate": 3.693101587448436e-05, "loss": 1.5053, "step": 1920 }, { "epoch": 3.43, "grad_norm": 8.68099594116211, "learning_rate": 3.680802112931996e-05, "loss": 1.3899, "step": 1930 }, { "epoch": 3.45, "grad_norm": 3.8347198963165283, "learning_rate": 3.6684657403919005e-05, "loss": 1.4519, "step": 1940 }, { "epoch": 3.47, "grad_norm": 9.875212669372559, "learning_rate": 3.6560928553184554e-05, "loss": 1.4788, "step": 1950 }, { "epoch": 3.48, "grad_norm": 8.638535499572754, "learning_rate": 3.6436838443429175e-05, "loss": 1.3777, "step": 1960 }, { "epoch": 3.5, "grad_norm": 3.73545503616333, "learning_rate": 3.631239095225417e-05, "loss": 1.4962, "step": 1970 }, { "epoch": 3.52, "grad_norm": 8.485962867736816, "learning_rate": 3.618758996842839e-05, "loss": 1.4377, "step": 1980 }, { "epoch": 3.54, "grad_norm": 4.3264055252075195, "learning_rate": 3.60624393917667e-05, "loss": 1.5695, "step": 1990 }, { "epoch": 3.56, "grad_norm": 3.979128837585449, "learning_rate": 3.5936943133008183e-05, "loss": 1.2959, "step": 2000 }, { "epoch": 3.56, "eval_loss": 1.4598569869995117, "eval_runtime": 124.8036, "eval_samples_per_second": 8.013, "eval_steps_per_second": 2.003, "step": 2000 }, { "epoch": 3.57, "grad_norm": 7.0864338874816895, "learning_rate": 3.581110511369384e-05, "loss": 1.4301, "step": 2010 }, { "epoch": 3.59, "grad_norm": 4.597893714904785, "learning_rate": 3.568492926604412e-05, "loss": 1.2962, "step": 2020 }, { "epoch": 3.61, "grad_norm": 3.7413573265075684, "learning_rate": 3.555841953283603e-05, "loss": 1.3708, "step": 2030 }, { "epoch": 3.63, "grad_norm": 4.206364631652832, "learning_rate": 3.5431579867279905e-05, "loss": 1.4758, "step": 2040 }, { "epoch": 3.64, "grad_norm": 5.203850269317627, "learning_rate": 3.530441423289591e-05, "loss": 1.4563, "step": 2050 }, { "epoch": 3.66, "grad_norm": 3.0671565532684326, "learning_rate": 3.517692660339018e-05, "loss": 1.322, "step": 2060 }, { "epoch": 3.68, "grad_norm": 4.655951499938965, "learning_rate": 3.504912096253061e-05, "loss": 1.4868, "step": 2070 }, { "epoch": 3.7, "grad_norm": 3.5286195278167725, "learning_rate": 3.492100130402242e-05, "loss": 1.3379, "step": 2080 }, { "epoch": 3.72, "grad_norm": 3.526078701019287, "learning_rate": 3.479257163138334e-05, "loss": 1.4864, "step": 2090 }, { "epoch": 3.73, "grad_norm": 4.072696208953857, "learning_rate": 3.4663835957818515e-05, "loss": 1.4187, "step": 2100 }, { "epoch": 3.73, "eval_loss": 1.4545879364013672, "eval_runtime": 124.6694, "eval_samples_per_second": 8.021, "eval_steps_per_second": 2.005, "step": 2100 } ], "logging_steps": 10, "max_steps": 5620, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.6370752708179395e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }