{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.966777408637874, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006644518272425249, "grad_norm": 324.0, "learning_rate": 1.3333333333333334e-06, "loss": 34.1539, "step": 1 }, { "epoch": 0.03322259136212625, "grad_norm": 328.0, "learning_rate": 6.666666666666667e-06, "loss": 34.4732, "step": 5 }, { "epoch": 0.0664451827242525, "grad_norm": 132.0, "learning_rate": 1.3333333333333333e-05, "loss": 30.9731, "step": 10 }, { "epoch": 0.09966777408637874, "grad_norm": 57.75, "learning_rate": 2e-05, "loss": 24.1357, "step": 15 }, { "epoch": 0.132890365448505, "grad_norm": 19.125, "learning_rate": 2.6666666666666667e-05, "loss": 19.6743, "step": 20 }, { "epoch": 0.16611295681063123, "grad_norm": 14.1875, "learning_rate": 3.3333333333333335e-05, "loss": 17.9465, "step": 25 }, { "epoch": 0.19933554817275748, "grad_norm": 7.25, "learning_rate": 4e-05, "loss": 15.9561, "step": 30 }, { "epoch": 0.23255813953488372, "grad_norm": 4.125, "learning_rate": 4.666666666666667e-05, "loss": 14.7788, "step": 35 }, { "epoch": 0.26578073089701, "grad_norm": 3.484375, "learning_rate": 5.333333333333333e-05, "loss": 14.139, "step": 40 }, { "epoch": 0.29900332225913623, "grad_norm": 4.75, "learning_rate": 6e-05, "loss": 13.5886, "step": 45 }, { "epoch": 0.33222591362126247, "grad_norm": 6.125, "learning_rate": 6.666666666666667e-05, "loss": 13.0275, "step": 50 }, { "epoch": 0.3654485049833887, "grad_norm": 11.5625, "learning_rate": 7.333333333333333e-05, "loss": 11.9071, "step": 55 }, { "epoch": 0.39867109634551495, "grad_norm": 18.375, "learning_rate": 8e-05, "loss": 9.4575, "step": 60 }, { "epoch": 0.4318936877076412, "grad_norm": 21.25, "learning_rate": 8.666666666666667e-05, "loss": 5.8479, "step": 65 }, { "epoch": 0.46511627906976744, "grad_norm": 5.09375, "learning_rate": 9.333333333333334e-05, "loss": 2.6937, "step": 70 }, { "epoch": 0.4983388704318937, "grad_norm": 6.46875, "learning_rate": 0.0001, "loss": 2.0051, "step": 75 }, { "epoch": 0.53156146179402, "grad_norm": 2.78125, "learning_rate": 0.00010666666666666667, "loss": 1.7309, "step": 80 }, { "epoch": 0.5647840531561462, "grad_norm": 0.73828125, "learning_rate": 0.00011333333333333334, "loss": 1.5823, "step": 85 }, { "epoch": 0.5980066445182725, "grad_norm": 1.359375, "learning_rate": 0.00012, "loss": 1.4702, "step": 90 }, { "epoch": 0.6312292358803987, "grad_norm": 0.94140625, "learning_rate": 0.00012666666666666666, "loss": 1.3996, "step": 95 }, { "epoch": 0.6644518272425249, "grad_norm": 0.859375, "learning_rate": 0.00013333333333333334, "loss": 1.3389, "step": 100 }, { "epoch": 0.6976744186046512, "grad_norm": 1.0390625, "learning_rate": 0.00014, "loss": 1.293, "step": 105 }, { "epoch": 0.7308970099667774, "grad_norm": 1.2265625, "learning_rate": 0.00014666666666666666, "loss": 1.2656, "step": 110 }, { "epoch": 0.7641196013289037, "grad_norm": 0.5703125, "learning_rate": 0.00015333333333333334, "loss": 1.2254, "step": 115 }, { "epoch": 0.7973421926910299, "grad_norm": 1.0546875, "learning_rate": 0.00016, "loss": 1.2072, "step": 120 }, { "epoch": 0.8305647840531561, "grad_norm": 1.28125, "learning_rate": 0.0001666666666666667, "loss": 1.1856, "step": 125 }, { "epoch": 0.8637873754152824, "grad_norm": 1.3046875, "learning_rate": 0.00017333333333333334, "loss": 1.169, "step": 130 }, { "epoch": 0.8970099667774086, "grad_norm": 1.3125, "learning_rate": 0.00018, "loss": 1.1497, "step": 135 }, { "epoch": 0.9302325581395349, "grad_norm": 1.9765625, "learning_rate": 0.0001866666666666667, "loss": 1.131, "step": 140 }, { "epoch": 0.9634551495016611, "grad_norm": 1.609375, "learning_rate": 0.00019333333333333333, "loss": 1.1275, "step": 145 }, { "epoch": 0.9966777408637874, "grad_norm": 6.65625, "learning_rate": 0.0002, "loss": 1.1216, "step": 150 }, { "epoch": 0.9966777408637874, "eval_loss": 2.580465793609619, "eval_runtime": 0.2799, "eval_samples_per_second": 35.728, "eval_steps_per_second": 3.573, "step": 150 }, { "epoch": 1.0299003322259137, "grad_norm": 1.1171875, "learning_rate": 0.00019999323080037624, "loss": 1.1202, "step": 155 }, { "epoch": 1.06312292358804, "grad_norm": 7.21875, "learning_rate": 0.00019997292411794618, "loss": 1.0982, "step": 160 }, { "epoch": 1.0963455149501662, "grad_norm": 0.6875, "learning_rate": 0.0001999390827019096, "loss": 1.1059, "step": 165 }, { "epoch": 1.1295681063122924, "grad_norm": 0.74609375, "learning_rate": 0.0001998917111338525, "loss": 1.079, "step": 170 }, { "epoch": 1.1627906976744187, "grad_norm": 1.203125, "learning_rate": 0.00019983081582712685, "loss": 1.0626, "step": 175 }, { "epoch": 1.196013289036545, "grad_norm": 3.75, "learning_rate": 0.00019975640502598244, "loss": 1.0644, "step": 180 }, { "epoch": 1.2292358803986712, "grad_norm": 0.85546875, "learning_rate": 0.00019966848880445062, "loss": 1.064, "step": 185 }, { "epoch": 1.2624584717607974, "grad_norm": 1.1796875, "learning_rate": 0.00019956707906498044, "loss": 1.0638, "step": 190 }, { "epoch": 1.2956810631229236, "grad_norm": 1.75, "learning_rate": 0.00019945218953682734, "loss": 1.0598, "step": 195 }, { "epoch": 1.3289036544850499, "grad_norm": 1.2734375, "learning_rate": 0.00019932383577419432, "loss": 1.0433, "step": 200 }, { "epoch": 1.3621262458471761, "grad_norm": 1.1171875, "learning_rate": 0.00019918203515412617, "loss": 1.0375, "step": 205 }, { "epoch": 1.3953488372093024, "grad_norm": 1.1171875, "learning_rate": 0.00019902680687415705, "loss": 1.0293, "step": 210 }, { "epoch": 1.4285714285714286, "grad_norm": 1.1640625, "learning_rate": 0.00019885817194971117, "loss": 1.0196, "step": 215 }, { "epoch": 1.4617940199335548, "grad_norm": 1.3828125, "learning_rate": 0.00019867615321125795, "loss": 1.0227, "step": 220 }, { "epoch": 1.495016611295681, "grad_norm": 2.703125, "learning_rate": 0.00019848077530122083, "loss": 1.0192, "step": 225 }, { "epoch": 1.5282392026578073, "grad_norm": 2.90625, "learning_rate": 0.00019827206467064133, "loss": 1.0254, "step": 230 }, { "epoch": 1.5614617940199336, "grad_norm": 1.90625, "learning_rate": 0.00019805004957559793, "loss": 1.0076, "step": 235 }, { "epoch": 1.5946843853820598, "grad_norm": 1.2578125, "learning_rate": 0.00019781476007338058, "loss": 0.9979, "step": 240 }, { "epoch": 1.627906976744186, "grad_norm": 4.1875, "learning_rate": 0.00019756622801842143, "loss": 0.9963, "step": 245 }, { "epoch": 1.6611295681063123, "grad_norm": 2.625, "learning_rate": 0.00019730448705798239, "loss": 1.0017, "step": 250 }, { "epoch": 1.6943521594684385, "grad_norm": 2.9375, "learning_rate": 0.00019702957262759965, "loss": 1.0055, "step": 255 }, { "epoch": 1.7275747508305648, "grad_norm": 2.40625, "learning_rate": 0.00019674152194628638, "loss": 0.993, "step": 260 }, { "epoch": 1.760797342192691, "grad_norm": 1.3046875, "learning_rate": 0.0001964403740114939, "loss": 0.9875, "step": 265 }, { "epoch": 1.7940199335548173, "grad_norm": 1.2734375, "learning_rate": 0.0001961261695938319, "loss": 1.0015, "step": 270 }, { "epoch": 1.8272425249169435, "grad_norm": 1.0, "learning_rate": 0.0001957989512315489, "loss": 0.9879, "step": 275 }, { "epoch": 1.8604651162790697, "grad_norm": 1.8828125, "learning_rate": 0.0001954587632247732, "loss": 0.9846, "step": 280 }, { "epoch": 1.893687707641196, "grad_norm": 1.09375, "learning_rate": 0.00019510565162951537, "loss": 0.9816, "step": 285 }, { "epoch": 1.9269102990033222, "grad_norm": 1.15625, "learning_rate": 0.00019473966425143292, "loss": 0.9832, "step": 290 }, { "epoch": 1.9601328903654485, "grad_norm": 1.3359375, "learning_rate": 0.00019436085063935835, "loss": 0.9838, "step": 295 }, { "epoch": 1.9933554817275747, "grad_norm": 0.76171875, "learning_rate": 0.00019396926207859084, "loss": 0.9828, "step": 300 }, { "epoch": 2.0, "eval_loss": 2.516935110092163, "eval_runtime": 0.2355, "eval_samples_per_second": 42.456, "eval_steps_per_second": 4.246, "step": 301 }, { "epoch": 2.026578073089701, "grad_norm": 1.765625, "learning_rate": 0.00019356495158395315, "loss": 0.9602, "step": 305 }, { "epoch": 2.0598006644518274, "grad_norm": 3.375, "learning_rate": 0.00019314797389261424, "loss": 0.9484, "step": 310 }, { "epoch": 2.0930232558139537, "grad_norm": 0.54296875, "learning_rate": 0.00019271838545667876, "loss": 0.9496, "step": 315 }, { "epoch": 2.12624584717608, "grad_norm": 0.80859375, "learning_rate": 0.00019227624443554425, "loss": 0.9405, "step": 320 }, { "epoch": 2.159468438538206, "grad_norm": 1.4765625, "learning_rate": 0.00019182161068802741, "loss": 0.9509, "step": 325 }, { "epoch": 2.1926910299003324, "grad_norm": 1.3515625, "learning_rate": 0.0001913545457642601, "loss": 0.9532, "step": 330 }, { "epoch": 2.2259136212624586, "grad_norm": 1.0234375, "learning_rate": 0.00019087511289735644, "loss": 0.9421, "step": 335 }, { "epoch": 2.259136212624585, "grad_norm": 3.453125, "learning_rate": 0.00019038337699485208, "loss": 0.9347, "step": 340 }, { "epoch": 2.292358803986711, "grad_norm": 1.265625, "learning_rate": 0.0001898794046299167, "loss": 0.9451, "step": 345 }, { "epoch": 2.3255813953488373, "grad_norm": 5.25, "learning_rate": 0.00018936326403234125, "loss": 0.9503, "step": 350 }, { "epoch": 2.3588039867109636, "grad_norm": 1.2421875, "learning_rate": 0.00018883502507930042, "loss": 0.9515, "step": 355 }, { "epoch": 2.39202657807309, "grad_norm": 1.4375, "learning_rate": 0.00018829475928589271, "loss": 0.9382, "step": 360 }, { "epoch": 2.425249169435216, "grad_norm": 0.82421875, "learning_rate": 0.0001877425397954582, "loss": 0.9309, "step": 365 }, { "epoch": 2.4584717607973423, "grad_norm": 1.578125, "learning_rate": 0.00018717844136967624, "loss": 0.9487, "step": 370 }, { "epoch": 2.4916943521594686, "grad_norm": 1.3359375, "learning_rate": 0.00018660254037844388, "loss": 0.9414, "step": 375 }, { "epoch": 2.524916943521595, "grad_norm": 1.3125, "learning_rate": 0.00018601491478953657, "loss": 0.9575, "step": 380 }, { "epoch": 2.558139534883721, "grad_norm": 1.90625, "learning_rate": 0.00018541564415805258, "loss": 0.9469, "step": 385 }, { "epoch": 2.5913621262458473, "grad_norm": 8.25, "learning_rate": 0.0001848048096156426, "loss": 0.9246, "step": 390 }, { "epoch": 2.6245847176079735, "grad_norm": 0.921875, "learning_rate": 0.00018418249385952575, "loss": 0.9357, "step": 395 }, { "epoch": 2.6578073089700998, "grad_norm": 1.59375, "learning_rate": 0.00018354878114129367, "loss": 0.9264, "step": 400 }, { "epoch": 2.691029900332226, "grad_norm": 2.125, "learning_rate": 0.00018290375725550417, "loss": 0.934, "step": 405 }, { "epoch": 2.7242524916943522, "grad_norm": 5.15625, "learning_rate": 0.00018224750952806624, "loss": 0.9378, "step": 410 }, { "epoch": 2.7574750830564785, "grad_norm": 0.66796875, "learning_rate": 0.00018158012680441723, "loss": 0.9325, "step": 415 }, { "epoch": 2.7906976744186047, "grad_norm": 1.109375, "learning_rate": 0.00018090169943749476, "loss": 0.9343, "step": 420 }, { "epoch": 2.823920265780731, "grad_norm": 0.68359375, "learning_rate": 0.0001802123192755044, "loss": 0.9322, "step": 425 }, { "epoch": 2.857142857142857, "grad_norm": 1.25, "learning_rate": 0.0001795120796494848, "loss": 0.9203, "step": 430 }, { "epoch": 2.8903654485049834, "grad_norm": 0.67578125, "learning_rate": 0.00017880107536067218, "loss": 0.9181, "step": 435 }, { "epoch": 2.9235880398671097, "grad_norm": 0.66796875, "learning_rate": 0.00017807940266766593, "loss": 0.9152, "step": 440 }, { "epoch": 2.956810631229236, "grad_norm": 0.57421875, "learning_rate": 0.0001773471592733964, "loss": 0.9193, "step": 445 }, { "epoch": 2.990033222591362, "grad_norm": 0.69140625, "learning_rate": 0.0001766044443118978, "loss": 0.9157, "step": 450 }, { "epoch": 2.9966777408637872, "eval_loss": 2.4835643768310547, "eval_runtime": 0.2608, "eval_samples_per_second": 38.338, "eval_steps_per_second": 3.834, "step": 451 }, { "epoch": 3.0232558139534884, "grad_norm": 1.390625, "learning_rate": 0.00017585135833488692, "loss": 0.9023, "step": 455 }, { "epoch": 3.0564784053156147, "grad_norm": 1.5078125, "learning_rate": 0.00017508800329814995, "loss": 0.8957, "step": 460 }, { "epoch": 3.089700996677741, "grad_norm": 1.75, "learning_rate": 0.00017431448254773944, "loss": 0.8963, "step": 465 }, { "epoch": 3.122923588039867, "grad_norm": 1.4921875, "learning_rate": 0.0001735309008059829, "loss": 0.8938, "step": 470 }, { "epoch": 3.1561461794019934, "grad_norm": 1.1484375, "learning_rate": 0.00017273736415730488, "loss": 0.8832, "step": 475 }, { "epoch": 3.1893687707641196, "grad_norm": 0.734375, "learning_rate": 0.0001719339800338651, "loss": 0.8824, "step": 480 }, { "epoch": 3.222591362126246, "grad_norm": 0.92578125, "learning_rate": 0.00017112085720101373, "loss": 0.8985, "step": 485 }, { "epoch": 3.255813953488372, "grad_norm": 0.77734375, "learning_rate": 0.0001702981057425662, "loss": 0.8915, "step": 490 }, { "epoch": 3.2890365448504983, "grad_norm": 1.0703125, "learning_rate": 0.00016946583704589973, "loss": 0.8959, "step": 495 }, { "epoch": 3.3222591362126246, "grad_norm": 0.640625, "learning_rate": 0.0001686241637868734, "loss": 0.8932, "step": 500 }, { "epoch": 3.355481727574751, "grad_norm": 0.875, "learning_rate": 0.00016777319991457325, "loss": 0.9034, "step": 505 }, { "epoch": 3.388704318936877, "grad_norm": 1.03125, "learning_rate": 0.00016691306063588583, "loss": 0.8914, "step": 510 }, { "epoch": 3.4219269102990033, "grad_norm": 1.0078125, "learning_rate": 0.00016604386239990078, "loss": 0.8968, "step": 515 }, { "epoch": 3.4551495016611296, "grad_norm": 0.7109375, "learning_rate": 0.00016516572288214552, "loss": 0.8899, "step": 520 }, { "epoch": 3.488372093023256, "grad_norm": 0.55078125, "learning_rate": 0.00016427876096865394, "loss": 0.888, "step": 525 }, { "epoch": 3.521594684385382, "grad_norm": 1.5703125, "learning_rate": 0.00016338309673987101, "loss": 0.8966, "step": 530 }, { "epoch": 3.5548172757475083, "grad_norm": 0.7890625, "learning_rate": 0.000162478851454396, "loss": 0.8802, "step": 535 }, { "epoch": 3.5880398671096345, "grad_norm": 0.63671875, "learning_rate": 0.0001615661475325658, "loss": 0.8864, "step": 540 }, { "epoch": 3.6212624584717608, "grad_norm": 1.3359375, "learning_rate": 0.00016064510853988138, "loss": 0.8816, "step": 545 }, { "epoch": 3.654485049833887, "grad_norm": 1.484375, "learning_rate": 0.00015971585917027862, "loss": 0.8906, "step": 550 }, { "epoch": 3.6877076411960132, "grad_norm": 1.09375, "learning_rate": 0.00015877852522924732, "loss": 0.8896, "step": 555 }, { "epoch": 3.7209302325581395, "grad_norm": 0.73046875, "learning_rate": 0.00015783323361679864, "loss": 0.8806, "step": 560 }, { "epoch": 3.7541528239202657, "grad_norm": 1.25, "learning_rate": 0.00015688011231028518, "loss": 0.8758, "step": 565 }, { "epoch": 3.787375415282392, "grad_norm": 1.2109375, "learning_rate": 0.0001559192903470747, "loss": 0.871, "step": 570 }, { "epoch": 3.820598006644518, "grad_norm": 0.7578125, "learning_rate": 0.0001549508978070806, "loss": 0.8882, "step": 575 }, { "epoch": 3.8538205980066444, "grad_norm": 0.66015625, "learning_rate": 0.0001539750657951513, "loss": 0.8719, "step": 580 }, { "epoch": 3.8870431893687707, "grad_norm": 0.58203125, "learning_rate": 0.0001529919264233205, "loss": 0.8794, "step": 585 }, { "epoch": 3.920265780730897, "grad_norm": 0.82421875, "learning_rate": 0.00015200161279292155, "loss": 0.8787, "step": 590 }, { "epoch": 3.953488372093023, "grad_norm": 0.8125, "learning_rate": 0.00015100425897656753, "loss": 0.873, "step": 595 }, { "epoch": 3.9867109634551494, "grad_norm": 0.578125, "learning_rate": 0.00015000000000000001, "loss": 0.8753, "step": 600 }, { "epoch": 4.0, "eval_loss": 2.5010673999786377, "eval_runtime": 0.239, "eval_samples_per_second": 41.842, "eval_steps_per_second": 4.184, "step": 602 }, { "epoch": 4.019933554817276, "grad_norm": 1.359375, "learning_rate": 0.0001489889718238087, "loss": 0.8697, "step": 605 }, { "epoch": 4.053156146179402, "grad_norm": 0.9921875, "learning_rate": 0.00014797131132502465, "loss": 0.8496, "step": 610 }, { "epoch": 4.086378737541528, "grad_norm": 1.765625, "learning_rate": 0.00014694715627858908, "loss": 0.8601, "step": 615 }, { "epoch": 4.119601328903655, "grad_norm": 1.140625, "learning_rate": 0.00014591664533870118, "loss": 0.8647, "step": 620 }, { "epoch": 4.152823920265781, "grad_norm": 1.140625, "learning_rate": 0.00014487991802004623, "loss": 0.8541, "step": 625 }, { "epoch": 4.186046511627907, "grad_norm": 0.6015625, "learning_rate": 0.00014383711467890774, "loss": 0.8481, "step": 630 }, { "epoch": 4.219269102990033, "grad_norm": 0.96875, "learning_rate": 0.00014278837649416544, "loss": 0.8514, "step": 635 }, { "epoch": 4.25249169435216, "grad_norm": 0.79296875, "learning_rate": 0.0001417338454481818, "loss": 0.8498, "step": 640 }, { "epoch": 4.285714285714286, "grad_norm": 0.734375, "learning_rate": 0.00014067366430758004, "loss": 0.8368, "step": 645 }, { "epoch": 4.318936877076412, "grad_norm": 0.875, "learning_rate": 0.0001396079766039157, "loss": 0.8439, "step": 650 }, { "epoch": 4.352159468438538, "grad_norm": 0.67578125, "learning_rate": 0.00013853692661424484, "loss": 0.8565, "step": 655 }, { "epoch": 4.385382059800665, "grad_norm": 0.79296875, "learning_rate": 0.00013746065934159123, "loss": 0.8426, "step": 660 }, { "epoch": 4.4186046511627906, "grad_norm": 0.6953125, "learning_rate": 0.00013637932049531516, "loss": 0.8471, "step": 665 }, { "epoch": 4.451827242524917, "grad_norm": 1.28125, "learning_rate": 0.00013529305647138687, "loss": 0.8417, "step": 670 }, { "epoch": 4.485049833887043, "grad_norm": 0.94140625, "learning_rate": 0.00013420201433256689, "loss": 0.8493, "step": 675 }, { "epoch": 4.51827242524917, "grad_norm": 1.59375, "learning_rate": 0.0001331063417884958, "loss": 0.8506, "step": 680 }, { "epoch": 4.5514950166112955, "grad_norm": 0.97265625, "learning_rate": 0.00013200618717569714, "loss": 0.841, "step": 685 }, { "epoch": 4.584717607973422, "grad_norm": 0.75, "learning_rate": 0.00013090169943749476, "loss": 0.8415, "step": 690 }, { "epoch": 4.617940199335548, "grad_norm": 0.703125, "learning_rate": 0.0001297930281038482, "loss": 0.8506, "step": 695 }, { "epoch": 4.651162790697675, "grad_norm": 0.8359375, "learning_rate": 0.00012868032327110904, "loss": 0.8425, "step": 700 }, { "epoch": 4.6843853820598005, "grad_norm": 1.1953125, "learning_rate": 0.0001275637355816999, "loss": 0.8466, "step": 705 }, { "epoch": 4.717607973421927, "grad_norm": 0.51171875, "learning_rate": 0.00012644341620372023, "loss": 0.841, "step": 710 }, { "epoch": 4.750830564784053, "grad_norm": 1.7578125, "learning_rate": 0.0001253195168104802, "loss": 0.8396, "step": 715 }, { "epoch": 4.78405315614618, "grad_norm": 1.609375, "learning_rate": 0.00012419218955996676, "loss": 0.8423, "step": 720 }, { "epoch": 4.8172757475083055, "grad_norm": 0.86328125, "learning_rate": 0.00012306158707424403, "loss": 0.839, "step": 725 }, { "epoch": 4.850498338870432, "grad_norm": 0.75, "learning_rate": 0.00012192786241879033, "loss": 0.8342, "step": 730 }, { "epoch": 4.883720930232558, "grad_norm": 1.1328125, "learning_rate": 0.00012079116908177593, "loss": 0.8358, "step": 735 }, { "epoch": 4.916943521594685, "grad_norm": 1.078125, "learning_rate": 0.00011965166095328301, "loss": 0.8432, "step": 740 }, { "epoch": 4.95016611295681, "grad_norm": 0.68359375, "learning_rate": 0.00011850949230447145, "loss": 0.8368, "step": 745 }, { "epoch": 4.983388704318937, "grad_norm": 1.0078125, "learning_rate": 0.00011736481776669306, "loss": 0.8334, "step": 750 }, { "epoch": 4.996677740863787, "eval_loss": 2.4944658279418945, "eval_runtime": 0.2592, "eval_samples_per_second": 38.58, "eval_steps_per_second": 3.858, "step": 752 }, { "epoch": 5.016611295681063, "grad_norm": 0.7265625, "learning_rate": 0.00011621779231055676, "loss": 0.8264, "step": 755 }, { "epoch": 5.04983388704319, "grad_norm": 1.6484375, "learning_rate": 0.00011506857122494831, "loss": 0.8175, "step": 760 }, { "epoch": 5.083056478405315, "grad_norm": 0.8828125, "learning_rate": 0.00011391731009600654, "loss": 0.8207, "step": 765 }, { "epoch": 5.116279069767442, "grad_norm": 0.8359375, "learning_rate": 0.00011276416478605949, "loss": 0.8134, "step": 770 }, { "epoch": 5.149501661129568, "grad_norm": 0.8828125, "learning_rate": 0.00011160929141252303, "loss": 0.8146, "step": 775 }, { "epoch": 5.1827242524916945, "grad_norm": 1.140625, "learning_rate": 0.00011045284632676536, "loss": 0.8118, "step": 780 }, { "epoch": 5.21594684385382, "grad_norm": 3.4375, "learning_rate": 0.00010929498609293924, "loss": 0.8142, "step": 785 }, { "epoch": 5.249169435215947, "grad_norm": 1.015625, "learning_rate": 0.00010813586746678583, "loss": 0.8156, "step": 790 }, { "epoch": 5.282392026578073, "grad_norm": 2.828125, "learning_rate": 0.00010697564737441252, "loss": 0.8097, "step": 795 }, { "epoch": 5.3156146179401995, "grad_norm": 0.8828125, "learning_rate": 0.00010581448289104758, "loss": 0.8213, "step": 800 }, { "epoch": 5.348837209302325, "grad_norm": 1.3984375, "learning_rate": 0.0001046525312197747, "loss": 0.8087, "step": 805 }, { "epoch": 5.382059800664452, "grad_norm": 1.0625, "learning_rate": 0.00010348994967025012, "loss": 0.8046, "step": 810 }, { "epoch": 5.415282392026578, "grad_norm": 2.734375, "learning_rate": 0.00010232689563740563, "loss": 0.8086, "step": 815 }, { "epoch": 5.4485049833887045, "grad_norm": 0.9921875, "learning_rate": 0.00010116352658013973, "loss": 0.809, "step": 820 }, { "epoch": 5.48172757475083, "grad_norm": 1.0, "learning_rate": 0.0001, "loss": 0.8155, "step": 825 }, { "epoch": 5.514950166112957, "grad_norm": 0.73046875, "learning_rate": 9.883647341986032e-05, "loss": 0.8016, "step": 830 }, { "epoch": 5.548172757475083, "grad_norm": 0.6796875, "learning_rate": 9.767310436259438e-05, "loss": 0.8013, "step": 835 }, { "epoch": 5.5813953488372094, "grad_norm": 0.81640625, "learning_rate": 9.651005032974994e-05, "loss": 0.8123, "step": 840 }, { "epoch": 5.614617940199335, "grad_norm": 2.1875, "learning_rate": 9.534746878022534e-05, "loss": 0.8163, "step": 845 }, { "epoch": 5.647840531561462, "grad_norm": 0.72265625, "learning_rate": 9.418551710895243e-05, "loss": 0.8164, "step": 850 }, { "epoch": 5.681063122923588, "grad_norm": 1.6953125, "learning_rate": 9.302435262558747e-05, "loss": 0.7974, "step": 855 }, { "epoch": 5.714285714285714, "grad_norm": 0.76953125, "learning_rate": 9.186413253321418e-05, "loss": 0.8142, "step": 860 }, { "epoch": 5.74750830564784, "grad_norm": 1.109375, "learning_rate": 9.070501390706079e-05, "loss": 0.8026, "step": 865 }, { "epoch": 5.780730897009967, "grad_norm": 0.640625, "learning_rate": 8.954715367323468e-05, "loss": 0.8005, "step": 870 }, { "epoch": 5.813953488372093, "grad_norm": 0.85546875, "learning_rate": 8.839070858747697e-05, "loss": 0.8015, "step": 875 }, { "epoch": 5.847176079734219, "grad_norm": 0.52734375, "learning_rate": 8.723583521394054e-05, "loss": 0.7924, "step": 880 }, { "epoch": 5.880398671096345, "grad_norm": 0.59765625, "learning_rate": 8.608268990399349e-05, "loss": 0.812, "step": 885 }, { "epoch": 5.913621262458472, "grad_norm": 0.70703125, "learning_rate": 8.49314287750517e-05, "loss": 0.7969, "step": 890 }, { "epoch": 5.946843853820598, "grad_norm": 0.74609375, "learning_rate": 8.378220768944327e-05, "loss": 0.7965, "step": 895 }, { "epoch": 5.980066445182724, "grad_norm": 2.015625, "learning_rate": 8.263518223330697e-05, "loss": 0.796, "step": 900 }, { "epoch": 6.0, "eval_loss": 2.531708240509033, "eval_runtime": 0.239, "eval_samples_per_second": 41.85, "eval_steps_per_second": 4.185, "step": 903 }, { "epoch": 6.01328903654485, "grad_norm": 0.482421875, "learning_rate": 8.149050769552856e-05, "loss": 0.7892, "step": 905 }, { "epoch": 6.046511627906977, "grad_norm": 0.65234375, "learning_rate": 8.034833904671698e-05, "loss": 0.7792, "step": 910 }, { "epoch": 6.079734219269103, "grad_norm": 0.7578125, "learning_rate": 7.920883091822408e-05, "loss": 0.7814, "step": 915 }, { "epoch": 6.112956810631229, "grad_norm": 0.484375, "learning_rate": 7.807213758120966e-05, "loss": 0.7822, "step": 920 }, { "epoch": 6.146179401993355, "grad_norm": 0.80859375, "learning_rate": 7.693841292575598e-05, "loss": 0.7749, "step": 925 }, { "epoch": 6.179401993355482, "grad_norm": 0.81640625, "learning_rate": 7.580781044003324e-05, "loss": 0.7821, "step": 930 }, { "epoch": 6.212624584717608, "grad_norm": 5.34375, "learning_rate": 7.468048318951983e-05, "loss": 0.7872, "step": 935 }, { "epoch": 6.245847176079734, "grad_norm": 2.21875, "learning_rate": 7.35565837962798e-05, "loss": 0.7855, "step": 940 }, { "epoch": 6.27906976744186, "grad_norm": 3.28125, "learning_rate": 7.243626441830009e-05, "loss": 0.7763, "step": 945 }, { "epoch": 6.312292358803987, "grad_norm": 0.62890625, "learning_rate": 7.131967672889101e-05, "loss": 0.7901, "step": 950 }, { "epoch": 6.3455149501661126, "grad_norm": 0.9765625, "learning_rate": 7.02069718961518e-05, "loss": 0.7814, "step": 955 }, { "epoch": 6.378737541528239, "grad_norm": 0.8203125, "learning_rate": 6.909830056250527e-05, "loss": 0.7752, "step": 960 }, { "epoch": 6.411960132890365, "grad_norm": 0.92578125, "learning_rate": 6.799381282430284e-05, "loss": 0.7782, "step": 965 }, { "epoch": 6.445182724252492, "grad_norm": 0.91796875, "learning_rate": 6.68936582115042e-05, "loss": 0.7748, "step": 970 }, { "epoch": 6.4784053156146175, "grad_norm": 1.1328125, "learning_rate": 6.579798566743314e-05, "loss": 0.7815, "step": 975 }, { "epoch": 6.511627906976744, "grad_norm": 3.734375, "learning_rate": 6.470694352861312e-05, "loss": 0.7747, "step": 980 }, { "epoch": 6.544850498338871, "grad_norm": 0.6015625, "learning_rate": 6.362067950468489e-05, "loss": 0.785, "step": 985 }, { "epoch": 6.578073089700997, "grad_norm": 0.73828125, "learning_rate": 6.25393406584088e-05, "loss": 0.7716, "step": 990 }, { "epoch": 6.6112956810631225, "grad_norm": 0.79296875, "learning_rate": 6.146307338575519e-05, "loss": 0.7723, "step": 995 }, { "epoch": 6.644518272425249, "grad_norm": 0.69921875, "learning_rate": 6.039202339608432e-05, "loss": 0.7745, "step": 1000 }, { "epoch": 6.677740863787376, "grad_norm": 1.96875, "learning_rate": 5.9326335692419995e-05, "loss": 0.7848, "step": 1005 }, { "epoch": 6.710963455149502, "grad_norm": 0.734375, "learning_rate": 5.8266154551818216e-05, "loss": 0.7797, "step": 1010 }, { "epoch": 6.7441860465116275, "grad_norm": 0.474609375, "learning_rate": 5.72116235058346e-05, "loss": 0.7714, "step": 1015 }, { "epoch": 6.777408637873754, "grad_norm": 0.478515625, "learning_rate": 5.616288532109225e-05, "loss": 0.7716, "step": 1020 }, { "epoch": 6.810631229235881, "grad_norm": 0.494140625, "learning_rate": 5.5120081979953785e-05, "loss": 0.7807, "step": 1025 }, { "epoch": 6.843853820598007, "grad_norm": 0.65234375, "learning_rate": 5.4083354661298814e-05, "loss": 0.7647, "step": 1030 }, { "epoch": 6.877076411960132, "grad_norm": 0.6328125, "learning_rate": 5.305284372141095e-05, "loss": 0.7755, "step": 1035 }, { "epoch": 6.910299003322259, "grad_norm": 0.4765625, "learning_rate": 5.2028688674975415e-05, "loss": 0.7738, "step": 1040 }, { "epoch": 6.943521594684386, "grad_norm": 0.5625, "learning_rate": 5.101102817619131e-05, "loss": 0.7765, "step": 1045 }, { "epoch": 6.976744186046512, "grad_norm": 0.70703125, "learning_rate": 5.000000000000002e-05, "loss": 0.7745, "step": 1050 }, { "epoch": 6.996677740863787, "eval_loss": 2.5435612201690674, "eval_runtime": 0.2585, "eval_samples_per_second": 38.679, "eval_steps_per_second": 3.868, "step": 1053 }, { "epoch": 7.009966777408638, "grad_norm": 0.53125, "learning_rate": 4.899574102343247e-05, "loss": 0.771, "step": 1055 }, { "epoch": 7.043189368770764, "grad_norm": 0.640625, "learning_rate": 4.799838720707846e-05, "loss": 0.7653, "step": 1060 }, { "epoch": 7.076411960132891, "grad_norm": 0.52734375, "learning_rate": 4.700807357667952e-05, "loss": 0.7644, "step": 1065 }, { "epoch": 7.1096345514950166, "grad_norm": 0.490234375, "learning_rate": 4.6024934204848745e-05, "loss": 0.7632, "step": 1070 }, { "epoch": 7.142857142857143, "grad_norm": 0.55859375, "learning_rate": 4.50491021929194e-05, "loss": 0.7686, "step": 1075 }, { "epoch": 7.176079734219269, "grad_norm": 0.46484375, "learning_rate": 4.4080709652925336e-05, "loss": 0.7549, "step": 1080 }, { "epoch": 7.209302325581396, "grad_norm": 0.58203125, "learning_rate": 4.3119887689714844e-05, "loss": 0.7626, "step": 1085 }, { "epoch": 7.2425249169435215, "grad_norm": 0.5546875, "learning_rate": 4.216676638320135e-05, "loss": 0.7588, "step": 1090 }, { "epoch": 7.275747508305648, "grad_norm": 0.5, "learning_rate": 4.12214747707527e-05, "loss": 0.7583, "step": 1095 }, { "epoch": 7.308970099667774, "grad_norm": 0.6015625, "learning_rate": 4.028414082972141e-05, "loss": 0.7529, "step": 1100 }, { "epoch": 7.342192691029901, "grad_norm": 0.72265625, "learning_rate": 3.935489146011869e-05, "loss": 0.766, "step": 1105 }, { "epoch": 7.3754152823920265, "grad_norm": 0.46875, "learning_rate": 3.843385246743417e-05, "loss": 0.7592, "step": 1110 }, { "epoch": 7.408637873754153, "grad_norm": 0.431640625, "learning_rate": 3.7521148545604e-05, "loss": 0.7645, "step": 1115 }, { "epoch": 7.441860465116279, "grad_norm": 0.455078125, "learning_rate": 3.661690326012897e-05, "loss": 0.7629, "step": 1120 }, { "epoch": 7.475083056478406, "grad_norm": 0.4765625, "learning_rate": 3.5721239031346066e-05, "loss": 0.7591, "step": 1125 }, { "epoch": 7.5083056478405314, "grad_norm": 0.71484375, "learning_rate": 3.483427711785449e-05, "loss": 0.7558, "step": 1130 }, { "epoch": 7.541528239202658, "grad_norm": 0.53515625, "learning_rate": 3.395613760009925e-05, "loss": 0.7611, "step": 1135 }, { "epoch": 7.574750830564784, "grad_norm": 0.56640625, "learning_rate": 3.308693936411421e-05, "loss": 0.7619, "step": 1140 }, { "epoch": 7.607973421926911, "grad_norm": 0.44921875, "learning_rate": 3.222680008542678e-05, "loss": 0.7585, "step": 1145 }, { "epoch": 7.641196013289036, "grad_norm": 0.490234375, "learning_rate": 3.137583621312665e-05, "loss": 0.7551, "step": 1150 }, { "epoch": 7.674418604651163, "grad_norm": 0.490234375, "learning_rate": 3.053416295410026e-05, "loss": 0.7626, "step": 1155 }, { "epoch": 7.707641196013289, "grad_norm": 0.5, "learning_rate": 2.9701894257433826e-05, "loss": 0.764, "step": 1160 }, { "epoch": 7.740863787375416, "grad_norm": 0.46875, "learning_rate": 2.8879142798986292e-05, "loss": 0.755, "step": 1165 }, { "epoch": 7.774086378737541, "grad_norm": 0.46875, "learning_rate": 2.8066019966134904e-05, "loss": 0.7563, "step": 1170 }, { "epoch": 7.807308970099668, "grad_norm": 0.451171875, "learning_rate": 2.7262635842695127e-05, "loss": 0.7688, "step": 1175 }, { "epoch": 7.840531561461794, "grad_norm": 0.546875, "learning_rate": 2.6469099194017143e-05, "loss": 0.7665, "step": 1180 }, { "epoch": 7.8737541528239205, "grad_norm": 0.4453125, "learning_rate": 2.5685517452260567e-05, "loss": 0.7664, "step": 1185 }, { "epoch": 7.906976744186046, "grad_norm": 0.443359375, "learning_rate": 2.491199670185008e-05, "loss": 0.753, "step": 1190 }, { "epoch": 7.940199335548173, "grad_norm": 0.44921875, "learning_rate": 2.4148641665113113e-05, "loss": 0.7614, "step": 1195 }, { "epoch": 7.973421926910299, "grad_norm": 0.484375, "learning_rate": 2.339555568810221e-05, "loss": 0.7582, "step": 1200 }, { "epoch": 8.0, "eval_loss": 2.5521774291992188, "eval_runtime": 0.24, "eval_samples_per_second": 41.669, "eval_steps_per_second": 4.167, "step": 1204 }, { "epoch": 8.006644518272426, "grad_norm": 0.423828125, "learning_rate": 2.265284072660362e-05, "loss": 0.7646, "step": 1205 }, { "epoch": 8.039867109634551, "grad_norm": 0.44140625, "learning_rate": 2.192059733233408e-05, "loss": 0.758, "step": 1210 }, { "epoch": 8.073089700996677, "grad_norm": 0.439453125, "learning_rate": 2.119892463932781e-05, "loss": 0.7566, "step": 1215 }, { "epoch": 8.106312292358805, "grad_norm": 0.451171875, "learning_rate": 2.0487920350515212e-05, "loss": 0.7551, "step": 1220 }, { "epoch": 8.13953488372093, "grad_norm": 0.4375, "learning_rate": 1.9787680724495617e-05, "loss": 0.7421, "step": 1225 }, { "epoch": 8.172757475083056, "grad_norm": 0.44921875, "learning_rate": 1.9098300562505266e-05, "loss": 0.7513, "step": 1230 }, { "epoch": 8.205980066445182, "grad_norm": 0.44921875, "learning_rate": 1.8419873195582814e-05, "loss": 0.7578, "step": 1235 }, { "epoch": 8.23920265780731, "grad_norm": 0.421875, "learning_rate": 1.775249047193377e-05, "loss": 0.7518, "step": 1240 }, { "epoch": 8.272425249169435, "grad_norm": 0.498046875, "learning_rate": 1.7096242744495837e-05, "loss": 0.7519, "step": 1245 }, { "epoch": 8.305647840531561, "grad_norm": 0.5390625, "learning_rate": 1.6451218858706374e-05, "loss": 0.7514, "step": 1250 }, { "epoch": 8.338870431893687, "grad_norm": 0.43359375, "learning_rate": 1.5817506140474247e-05, "loss": 0.7553, "step": 1255 }, { "epoch": 8.372093023255815, "grad_norm": 0.466796875, "learning_rate": 1.5195190384357404e-05, "loss": 0.7487, "step": 1260 }, { "epoch": 8.40531561461794, "grad_norm": 0.43359375, "learning_rate": 1.458435584194745e-05, "loss": 0.7518, "step": 1265 }, { "epoch": 8.438538205980066, "grad_norm": 0.4296875, "learning_rate": 1.3985085210463477e-05, "loss": 0.7487, "step": 1270 }, { "epoch": 8.471760797342192, "grad_norm": 0.423828125, "learning_rate": 1.339745962155613e-05, "loss": 0.7467, "step": 1275 }, { "epoch": 8.50498338870432, "grad_norm": 0.4296875, "learning_rate": 1.2821558630323772e-05, "loss": 0.7478, "step": 1280 }, { "epoch": 8.538205980066445, "grad_norm": 0.46875, "learning_rate": 1.2257460204541794e-05, "loss": 0.7558, "step": 1285 }, { "epoch": 8.571428571428571, "grad_norm": 0.44921875, "learning_rate": 1.1705240714107302e-05, "loss": 0.7426, "step": 1290 }, { "epoch": 8.604651162790697, "grad_norm": 0.46875, "learning_rate": 1.116497492069961e-05, "loss": 0.7411, "step": 1295 }, { "epoch": 8.637873754152825, "grad_norm": 0.44140625, "learning_rate": 1.0636735967658784e-05, "loss": 0.7524, "step": 1300 }, { "epoch": 8.67109634551495, "grad_norm": 0.453125, "learning_rate": 1.0120595370083318e-05, "loss": 0.7499, "step": 1305 }, { "epoch": 8.704318936877076, "grad_norm": 0.435546875, "learning_rate": 9.616623005147951e-06, "loss": 0.7603, "step": 1310 }, { "epoch": 8.737541528239202, "grad_norm": 0.44140625, "learning_rate": 9.124887102643575e-06, "loss": 0.7563, "step": 1315 }, { "epoch": 8.77076411960133, "grad_norm": 0.4296875, "learning_rate": 8.645454235739903e-06, "loss": 0.7594, "step": 1320 }, { "epoch": 8.803986710963455, "grad_norm": 0.431640625, "learning_rate": 8.178389311972612e-06, "loss": 0.7648, "step": 1325 }, { "epoch": 8.837209302325581, "grad_norm": 0.443359375, "learning_rate": 7.72375556445577e-06, "loss": 0.7555, "step": 1330 }, { "epoch": 8.870431893687707, "grad_norm": 0.44140625, "learning_rate": 7.281614543321269e-06, "loss": 0.7461, "step": 1335 }, { "epoch": 8.903654485049834, "grad_norm": 0.470703125, "learning_rate": 6.852026107385756e-06, "loss": 0.7606, "step": 1340 }, { "epoch": 8.93687707641196, "grad_norm": 0.435546875, "learning_rate": 6.435048416046863e-06, "loss": 0.7598, "step": 1345 }, { "epoch": 8.970099667774086, "grad_norm": 0.439453125, "learning_rate": 6.030737921409169e-06, "loss": 0.754, "step": 1350 }, { "epoch": 8.996677740863788, "eval_loss": 2.5503978729248047, "eval_runtime": 0.2601, "eval_samples_per_second": 38.445, "eval_steps_per_second": 3.845, "step": 1354 }, { "epoch": 9.003322259136212, "grad_norm": 0.4375, "learning_rate": 5.639149360641649e-06, "loss": 0.7546, "step": 1355 }, { "epoch": 9.03654485049834, "grad_norm": 0.439453125, "learning_rate": 5.26033574856708e-06, "loss": 0.7562, "step": 1360 }, { "epoch": 9.069767441860465, "grad_norm": 0.419921875, "learning_rate": 4.8943483704846475e-06, "loss": 0.7522, "step": 1365 }, { "epoch": 9.102990033222591, "grad_norm": 0.427734375, "learning_rate": 4.541236775226809e-06, "loss": 0.7522, "step": 1370 }, { "epoch": 9.136212624584717, "grad_norm": 0.427734375, "learning_rate": 4.20104876845111e-06, "loss": 0.7509, "step": 1375 }, { "epoch": 9.169435215946844, "grad_norm": 0.41796875, "learning_rate": 3.873830406168111e-06, "loss": 0.7444, "step": 1380 }, { "epoch": 9.20265780730897, "grad_norm": 0.453125, "learning_rate": 3.5596259885061102e-06, "loss": 0.7561, "step": 1385 }, { "epoch": 9.235880398671096, "grad_norm": 0.443359375, "learning_rate": 3.2584780537136207e-06, "loss": 0.7502, "step": 1390 }, { "epoch": 9.269102990033222, "grad_norm": 0.4921875, "learning_rate": 2.970427372400353e-06, "loss": 0.7546, "step": 1395 }, { "epoch": 9.30232558139535, "grad_norm": 0.45703125, "learning_rate": 2.6955129420176196e-06, "loss": 0.7506, "step": 1400 }, { "epoch": 9.335548172757475, "grad_norm": 0.4296875, "learning_rate": 2.433771981578581e-06, "loss": 0.7531, "step": 1405 }, { "epoch": 9.368770764119601, "grad_norm": 0.427734375, "learning_rate": 2.1852399266194314e-06, "loss": 0.75, "step": 1410 }, { "epoch": 9.401993355481727, "grad_norm": 0.4765625, "learning_rate": 1.9499504244020693e-06, "loss": 0.7449, "step": 1415 }, { "epoch": 9.435215946843854, "grad_norm": 0.416015625, "learning_rate": 1.7279353293586765e-06, "loss": 0.765, "step": 1420 }, { "epoch": 9.46843853820598, "grad_norm": 0.44921875, "learning_rate": 1.5192246987791981e-06, "loss": 0.7472, "step": 1425 }, { "epoch": 9.501661129568106, "grad_norm": 0.431640625, "learning_rate": 1.323846788742078e-06, "loss": 0.7461, "step": 1430 }, { "epoch": 9.534883720930232, "grad_norm": 0.443359375, "learning_rate": 1.14182805028884e-06, "loss": 0.7501, "step": 1435 }, { "epoch": 9.56810631229236, "grad_norm": 0.43359375, "learning_rate": 9.731931258429638e-07, "loss": 0.7501, "step": 1440 }, { "epoch": 9.601328903654485, "grad_norm": 0.41796875, "learning_rate": 8.17964845873831e-07, "loss": 0.7511, "step": 1445 }, { "epoch": 9.634551495016611, "grad_norm": 0.427734375, "learning_rate": 6.761642258056978e-07, "loss": 0.7556, "step": 1450 }, { "epoch": 9.667774086378738, "grad_norm": 0.42578125, "learning_rate": 5.478104631726711e-07, "loss": 0.751, "step": 1455 }, { "epoch": 9.700996677740864, "grad_norm": 0.421875, "learning_rate": 4.329209350195651e-07, "loss": 0.7598, "step": 1460 }, { "epoch": 9.73421926910299, "grad_norm": 0.4375, "learning_rate": 3.315111955493944e-07, "loss": 0.7572, "step": 1465 }, { "epoch": 9.767441860465116, "grad_norm": 0.46484375, "learning_rate": 2.4359497401758024e-07, "loss": 0.7478, "step": 1470 }, { "epoch": 9.800664451827242, "grad_norm": 0.419921875, "learning_rate": 1.6918417287318245e-07, "loss": 0.749, "step": 1475 }, { "epoch": 9.83388704318937, "grad_norm": 0.44921875, "learning_rate": 1.0828886614754341e-07, "loss": 0.7488, "step": 1480 }, { "epoch": 9.867109634551495, "grad_norm": 0.4609375, "learning_rate": 6.09172980904238e-08, "loss": 0.7407, "step": 1485 }, { "epoch": 9.90033222591362, "grad_norm": 0.43359375, "learning_rate": 2.7075882053828605e-08, "loss": 0.7491, "step": 1490 }, { "epoch": 9.933554817275748, "grad_norm": 0.447265625, "learning_rate": 6.769199623779532e-09, "loss": 0.7417, "step": 1495 }, { "epoch": 9.966777408637874, "grad_norm": 0.435546875, "learning_rate": 0.0, "loss": 0.7572, "step": 1500 }, { "epoch": 9.966777408637874, "eval_loss": 2.5546562671661377, "eval_runtime": 0.2333, "eval_samples_per_second": 42.867, "eval_steps_per_second": 4.287, "step": 1500 }, { "epoch": 9.966777408637874, "step": 1500, "total_flos": 4.5794490708666614e+18, "train_loss": 1.5882705609003702, "train_runtime": 3659.0045, "train_samples_per_second": 26.291, "train_steps_per_second": 0.41 } ], "logging_steps": 5, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 4.5794490708666614e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }