{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994174757281553, "eval_steps": 500, "global_step": 771, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003883495145631068, "grad_norm": 2.65625, "learning_rate": 2.564102564102564e-06, "loss": 1.8569, "step": 1 }, { "epoch": 0.019417475728155338, "grad_norm": 2.609375, "learning_rate": 1.282051282051282e-05, "loss": 1.8227, "step": 5 }, { "epoch": 0.038834951456310676, "grad_norm": 2.09375, "learning_rate": 2.564102564102564e-05, "loss": 1.6509, "step": 10 }, { "epoch": 0.05825242718446602, "grad_norm": 1.21875, "learning_rate": 3.846153846153846e-05, "loss": 1.3453, "step": 15 }, { "epoch": 0.07766990291262135, "grad_norm": 0.94921875, "learning_rate": 5.128205128205128e-05, "loss": 1.1542, "step": 20 }, { "epoch": 0.0970873786407767, "grad_norm": 0.5546875, "learning_rate": 6.410256410256412e-05, "loss": 1.0378, "step": 25 }, { "epoch": 0.11650485436893204, "grad_norm": 0.51171875, "learning_rate": 7.692307692307693e-05, "loss": 0.9156, "step": 30 }, { "epoch": 0.13592233009708737, "grad_norm": 0.423828125, "learning_rate": 8.974358974358975e-05, "loss": 0.7894, "step": 35 }, { "epoch": 0.1553398058252427, "grad_norm": 0.333984375, "learning_rate": 0.00010256410256410256, "loss": 0.7181, "step": 40 }, { "epoch": 0.17475728155339806, "grad_norm": 0.353515625, "learning_rate": 0.00011538461538461538, "loss": 0.6847, "step": 45 }, { "epoch": 0.1941747572815534, "grad_norm": 0.392578125, "learning_rate": 0.00012820512820512823, "loss": 0.6471, "step": 50 }, { "epoch": 0.21359223300970873, "grad_norm": 0.345703125, "learning_rate": 0.00014102564102564104, "loss": 0.6139, "step": 55 }, { "epoch": 0.23300970873786409, "grad_norm": 0.3671875, "learning_rate": 0.00015384615384615385, "loss": 0.6034, "step": 60 }, { "epoch": 0.2524271844660194, "grad_norm": 0.376953125, "learning_rate": 0.0001666666666666667, "loss": 0.563, "step": 65 }, { "epoch": 0.27184466019417475, "grad_norm": 0.396484375, "learning_rate": 0.0001794871794871795, "loss": 0.5629, "step": 70 }, { "epoch": 0.2912621359223301, "grad_norm": 0.365234375, "learning_rate": 0.00019230769230769233, "loss": 0.5443, "step": 75 }, { "epoch": 0.3106796116504854, "grad_norm": 0.34375, "learning_rate": 0.0001999958898251569, "loss": 0.5345, "step": 80 }, { "epoch": 0.3300970873786408, "grad_norm": 0.33203125, "learning_rate": 0.00019994965423831854, "loss": 0.5196, "step": 85 }, { "epoch": 0.34951456310679613, "grad_norm": 0.330078125, "learning_rate": 0.00019985206917896563, "loss": 0.5243, "step": 90 }, { "epoch": 0.36893203883495146, "grad_norm": 0.3203125, "learning_rate": 0.00019970318478175218, "loss": 0.5218, "step": 95 }, { "epoch": 0.3883495145631068, "grad_norm": 0.28515625, "learning_rate": 0.00019950307753654017, "loss": 0.5253, "step": 100 }, { "epoch": 0.4077669902912621, "grad_norm": 0.326171875, "learning_rate": 0.00019925185024910277, "loss": 0.5084, "step": 105 }, { "epoch": 0.42718446601941745, "grad_norm": 0.376953125, "learning_rate": 0.00019894963198830768, "loss": 0.48, "step": 110 }, { "epoch": 0.44660194174757284, "grad_norm": 0.287109375, "learning_rate": 0.00019859657801980733, "loss": 0.4837, "step": 115 }, { "epoch": 0.46601941747572817, "grad_norm": 0.29296875, "learning_rate": 0.00019819286972627066, "loss": 0.4751, "step": 120 }, { "epoch": 0.4854368932038835, "grad_norm": 0.28125, "learning_rate": 0.00019773871451419736, "loss": 0.4962, "step": 125 }, { "epoch": 0.5048543689320388, "grad_norm": 0.30859375, "learning_rate": 0.00019723434570736181, "loss": 0.5064, "step": 130 }, { "epoch": 0.5242718446601942, "grad_norm": 0.283203125, "learning_rate": 0.00019668002242694238, "loss": 0.4812, "step": 135 }, { "epoch": 0.5436893203883495, "grad_norm": 0.3671875, "learning_rate": 0.00019607602945839698, "loss": 0.4815, "step": 140 }, { "epoch": 0.5631067961165048, "grad_norm": 0.267578125, "learning_rate": 0.00019542267710515368, "loss": 0.4763, "step": 145 }, { "epoch": 0.5825242718446602, "grad_norm": 0.287109375, "learning_rate": 0.000194720301029191, "loss": 0.4682, "step": 150 }, { "epoch": 0.6019417475728155, "grad_norm": 0.298828125, "learning_rate": 0.00019396926207859084, "loss": 0.4604, "step": 155 }, { "epoch": 0.6213592233009708, "grad_norm": 0.26953125, "learning_rate": 0.00019316994610215116, "loss": 0.4866, "step": 160 }, { "epoch": 0.6407766990291263, "grad_norm": 0.271484375, "learning_rate": 0.00019232276375115515, "loss": 0.4898, "step": 165 }, { "epoch": 0.6601941747572816, "grad_norm": 0.2578125, "learning_rate": 0.00019142815026839755, "loss": 0.4908, "step": 170 }, { "epoch": 0.6796116504854369, "grad_norm": 0.28515625, "learning_rate": 0.0001904865652645773, "loss": 0.4566, "step": 175 }, { "epoch": 0.6990291262135923, "grad_norm": 0.27734375, "learning_rate": 0.000189498492482171, "loss": 0.45, "step": 180 }, { "epoch": 0.7184466019417476, "grad_norm": 0.259765625, "learning_rate": 0.00018846443954690848, "loss": 0.4646, "step": 185 }, { "epoch": 0.7378640776699029, "grad_norm": 0.275390625, "learning_rate": 0.00018738493770697852, "loss": 0.4625, "step": 190 }, { "epoch": 0.7572815533980582, "grad_norm": 0.26171875, "learning_rate": 0.00018626054156009806, "loss": 0.4452, "step": 195 }, { "epoch": 0.7766990291262136, "grad_norm": 0.29296875, "learning_rate": 0.00018509182876858611, "loss": 0.4535, "step": 200 }, { "epoch": 0.7961165048543689, "grad_norm": 0.25390625, "learning_rate": 0.00018387939976258734, "loss": 0.4393, "step": 205 }, { "epoch": 0.8155339805825242, "grad_norm": 0.263671875, "learning_rate": 0.0001826238774315995, "loss": 0.4391, "step": 210 }, { "epoch": 0.8349514563106796, "grad_norm": 0.271484375, "learning_rate": 0.00018132590680446147, "loss": 0.443, "step": 215 }, { "epoch": 0.8543689320388349, "grad_norm": 0.279296875, "learning_rate": 0.00017998615471796775, "loss": 0.4527, "step": 220 }, { "epoch": 0.8737864077669902, "grad_norm": 0.265625, "learning_rate": 0.00017860530947427875, "loss": 0.4416, "step": 225 }, { "epoch": 0.8932038834951457, "grad_norm": 0.2890625, "learning_rate": 0.00017718408048730317, "loss": 0.4506, "step": 230 }, { "epoch": 0.912621359223301, "grad_norm": 0.298828125, "learning_rate": 0.00017572319791823424, "loss": 0.4538, "step": 235 }, { "epoch": 0.9320388349514563, "grad_norm": 0.263671875, "learning_rate": 0.000174223412300427, "loss": 0.4467, "step": 240 }, { "epoch": 0.9514563106796117, "grad_norm": 0.26953125, "learning_rate": 0.00017268549415380916, "loss": 0.4425, "step": 245 }, { "epoch": 0.970873786407767, "grad_norm": 0.267578125, "learning_rate": 0.00017111023358902392, "loss": 0.4389, "step": 250 }, { "epoch": 0.9902912621359223, "grad_norm": 0.251953125, "learning_rate": 0.00016949843990150796, "loss": 0.4177, "step": 255 }, { "epoch": 0.9980582524271845, "eval_loss": 0.4281997084617615, "eval_runtime": 22.0587, "eval_samples_per_second": 4.987, "eval_steps_per_second": 0.635, "step": 257 }, { "epoch": 1.0097087378640777, "grad_norm": 0.236328125, "learning_rate": 0.00016785094115571322, "loss": 0.4083, "step": 260 }, { "epoch": 1.029126213592233, "grad_norm": 0.275390625, "learning_rate": 0.00016616858375968595, "loss": 0.4012, "step": 265 }, { "epoch": 1.0485436893203883, "grad_norm": 0.28515625, "learning_rate": 0.00016445223203022166, "loss": 0.3935, "step": 270 }, { "epoch": 1.0679611650485437, "grad_norm": 0.279296875, "learning_rate": 0.00016270276774881954, "loss": 0.3972, "step": 275 }, { "epoch": 1.087378640776699, "grad_norm": 0.267578125, "learning_rate": 0.00016092108970866423, "loss": 0.4004, "step": 280 }, { "epoch": 1.1067961165048543, "grad_norm": 0.279296875, "learning_rate": 0.00015910811325286768, "loss": 0.3941, "step": 285 }, { "epoch": 1.1262135922330097, "grad_norm": 0.271484375, "learning_rate": 0.00015726476980420864, "loss": 0.3877, "step": 290 }, { "epoch": 1.145631067961165, "grad_norm": 0.287109375, "learning_rate": 0.00015539200638661104, "loss": 0.3819, "step": 295 }, { "epoch": 1.1650485436893203, "grad_norm": 0.291015625, "learning_rate": 0.00015349078513860726, "loss": 0.3816, "step": 300 }, { "epoch": 1.1844660194174756, "grad_norm": 0.318359375, "learning_rate": 0.00015156208281903613, "loss": 0.3953, "step": 305 }, { "epoch": 1.203883495145631, "grad_norm": 0.296875, "learning_rate": 0.0001496068903052299, "loss": 0.3905, "step": 310 }, { "epoch": 1.2233009708737863, "grad_norm": 0.2734375, "learning_rate": 0.0001476262120839475, "loss": 0.3763, "step": 315 }, { "epoch": 1.2427184466019416, "grad_norm": 0.3125, "learning_rate": 0.0001456210657353163, "loss": 0.3792, "step": 320 }, { "epoch": 1.262135922330097, "grad_norm": 0.29296875, "learning_rate": 0.00014359248141004668, "loss": 0.3794, "step": 325 }, { "epoch": 1.2815533980582523, "grad_norm": 0.294921875, "learning_rate": 0.00014154150130018866, "loss": 0.3756, "step": 330 }, { "epoch": 1.3009708737864076, "grad_norm": 0.287109375, "learning_rate": 0.00013946917910370233, "loss": 0.3876, "step": 335 }, { "epoch": 1.3203883495145632, "grad_norm": 0.306640625, "learning_rate": 0.00013737657948311683, "loss": 0.3819, "step": 340 }, { "epoch": 1.3398058252427185, "grad_norm": 0.26171875, "learning_rate": 0.00013526477751855644, "loss": 0.3925, "step": 345 }, { "epoch": 1.3592233009708738, "grad_norm": 0.287109375, "learning_rate": 0.00013313485815541454, "loss": 0.3915, "step": 350 }, { "epoch": 1.3786407766990292, "grad_norm": 0.28515625, "learning_rate": 0.00013098791564695927, "loss": 0.3902, "step": 355 }, { "epoch": 1.3980582524271845, "grad_norm": 0.2890625, "learning_rate": 0.0001288250529921571, "loss": 0.3864, "step": 360 }, { "epoch": 1.4174757281553398, "grad_norm": 0.27734375, "learning_rate": 0.00012664738136900348, "loss": 0.3778, "step": 365 }, { "epoch": 1.4368932038834952, "grad_norm": 0.2890625, "learning_rate": 0.0001244560195636515, "loss": 0.3732, "step": 370 }, { "epoch": 1.4563106796116505, "grad_norm": 0.318359375, "learning_rate": 0.00012225209339563145, "loss": 0.3856, "step": 375 }, { "epoch": 1.4757281553398058, "grad_norm": 0.265625, "learning_rate": 0.00012003673513945746, "loss": 0.3745, "step": 380 }, { "epoch": 1.4951456310679612, "grad_norm": 0.306640625, "learning_rate": 0.0001178110829429175, "loss": 0.3657, "step": 385 }, { "epoch": 1.5145631067961165, "grad_norm": 0.25390625, "learning_rate": 0.0001155762802423463, "loss": 0.3728, "step": 390 }, { "epoch": 1.5339805825242718, "grad_norm": 0.265625, "learning_rate": 0.0001133334751751809, "loss": 0.3849, "step": 395 }, { "epoch": 1.5533980582524272, "grad_norm": 0.291015625, "learning_rate": 0.00011108381999010111, "loss": 0.3979, "step": 400 }, { "epoch": 1.5728155339805825, "grad_norm": 0.26171875, "learning_rate": 0.00010882847045505808, "loss": 0.3819, "step": 405 }, { "epoch": 1.5922330097087378, "grad_norm": 0.2578125, "learning_rate": 0.00010656858526349449, "loss": 0.3704, "step": 410 }, { "epoch": 1.6116504854368932, "grad_norm": 0.26171875, "learning_rate": 0.00010430532543906179, "loss": 0.3702, "step": 415 }, { "epoch": 1.6310679611650487, "grad_norm": 0.283203125, "learning_rate": 0.00010203985373914056, "loss": 0.373, "step": 420 }, { "epoch": 1.650485436893204, "grad_norm": 0.279296875, "learning_rate": 9.977333405746979e-05, "loss": 0.366, "step": 425 }, { "epoch": 1.6699029126213594, "grad_norm": 0.30859375, "learning_rate": 9.750693082619273e-05, "loss": 0.3555, "step": 430 }, { "epoch": 1.6893203883495147, "grad_norm": 0.279296875, "learning_rate": 9.524180841762577e-05, "loss": 0.3793, "step": 435 }, { "epoch": 1.70873786407767, "grad_norm": 0.25390625, "learning_rate": 9.297913054605838e-05, "loss": 0.3692, "step": 440 }, { "epoch": 1.7281553398058254, "grad_norm": 0.27734375, "learning_rate": 9.072005966989084e-05, "loss": 0.3655, "step": 445 }, { "epoch": 1.7475728155339807, "grad_norm": 0.265625, "learning_rate": 8.846575639441732e-05, "loss": 0.3791, "step": 450 }, { "epoch": 1.766990291262136, "grad_norm": 0.29296875, "learning_rate": 8.621737887556114e-05, "loss": 0.3661, "step": 455 }, { "epoch": 1.7864077669902914, "grad_norm": 0.2890625, "learning_rate": 8.397608222486805e-05, "loss": 0.3611, "step": 460 }, { "epoch": 1.8058252427184467, "grad_norm": 0.291015625, "learning_rate": 8.174301791606385e-05, "loss": 0.3611, "step": 465 }, { "epoch": 1.825242718446602, "grad_norm": 0.279296875, "learning_rate": 7.951933319348095e-05, "loss": 0.3493, "step": 470 }, { "epoch": 1.8446601941747574, "grad_norm": 0.2578125, "learning_rate": 7.730617048265761e-05, "loss": 0.3655, "step": 475 }, { "epoch": 1.8640776699029127, "grad_norm": 0.318359375, "learning_rate": 7.510466680341301e-05, "loss": 0.3527, "step": 480 }, { "epoch": 1.883495145631068, "grad_norm": 0.298828125, "learning_rate": 7.291595318569951e-05, "loss": 0.3528, "step": 485 }, { "epoch": 1.9029126213592233, "grad_norm": 0.30078125, "learning_rate": 7.074115408853203e-05, "loss": 0.3784, "step": 490 }, { "epoch": 1.9223300970873787, "grad_norm": 0.27734375, "learning_rate": 6.858138682229376e-05, "loss": 0.3571, "step": 495 }, { "epoch": 1.941747572815534, "grad_norm": 0.265625, "learning_rate": 6.643776097471377e-05, "loss": 0.3658, "step": 500 }, { "epoch": 1.9611650485436893, "grad_norm": 0.275390625, "learning_rate": 6.431137784081282e-05, "loss": 0.3567, "step": 505 }, { "epoch": 1.9805825242718447, "grad_norm": 0.29296875, "learning_rate": 6.220332985710936e-05, "loss": 0.3526, "step": 510 }, { "epoch": 2.0, "grad_norm": 0.30078125, "learning_rate": 6.011470004037636e-05, "loss": 0.361, "step": 515 }, { "epoch": 2.0, "eval_loss": 0.38420218229293823, "eval_runtime": 22.0532, "eval_samples_per_second": 4.988, "eval_steps_per_second": 0.635, "step": 515 }, { "epoch": 2.0194174757281553, "grad_norm": 0.2421875, "learning_rate": 5.804656143123801e-05, "loss": 0.308, "step": 520 }, { "epoch": 2.0388349514563107, "grad_norm": 0.283203125, "learning_rate": 5.599997654289129e-05, "loss": 0.3136, "step": 525 }, { "epoch": 2.058252427184466, "grad_norm": 0.3046875, "learning_rate": 5.397599681523643e-05, "loss": 0.3037, "step": 530 }, { "epoch": 2.0776699029126213, "grad_norm": 0.298828125, "learning_rate": 5.1975662074695865e-05, "loss": 0.3098, "step": 535 }, { "epoch": 2.0970873786407767, "grad_norm": 0.28125, "learning_rate": 5.000000000000002e-05, "loss": 0.3111, "step": 540 }, { "epoch": 2.116504854368932, "grad_norm": 0.294921875, "learning_rate": 4.8050025594214e-05, "loss": 0.3083, "step": 545 }, { "epoch": 2.1359223300970873, "grad_norm": 0.29296875, "learning_rate": 4.6126740663276166e-05, "loss": 0.3133, "step": 550 }, { "epoch": 2.1553398058252426, "grad_norm": 0.29296875, "learning_rate": 4.423113330131707e-05, "loss": 0.3076, "step": 555 }, { "epoch": 2.174757281553398, "grad_norm": 0.29296875, "learning_rate": 4.236417738302257e-05, "loss": 0.2981, "step": 560 }, { "epoch": 2.1941747572815533, "grad_norm": 0.314453125, "learning_rate": 4.052683206330267e-05, "loss": 0.312, "step": 565 }, { "epoch": 2.2135922330097086, "grad_norm": 0.298828125, "learning_rate": 3.872004128452231e-05, "loss": 0.2942, "step": 570 }, { "epoch": 2.233009708737864, "grad_norm": 0.314453125, "learning_rate": 3.694473329154778e-05, "loss": 0.3205, "step": 575 }, { "epoch": 2.2524271844660193, "grad_norm": 0.3359375, "learning_rate": 3.5201820154857755e-05, "loss": 0.322, "step": 580 }, { "epoch": 2.2718446601941746, "grad_norm": 0.296875, "learning_rate": 3.3492197301964145e-05, "loss": 0.2931, "step": 585 }, { "epoch": 2.29126213592233, "grad_norm": 0.33203125, "learning_rate": 3.18167430573831e-05, "loss": 0.3107, "step": 590 }, { "epoch": 2.3106796116504853, "grad_norm": 0.302734375, "learning_rate": 3.0176318191392726e-05, "loss": 0.3065, "step": 595 }, { "epoch": 2.3300970873786406, "grad_norm": 0.3203125, "learning_rate": 2.8571765477809643e-05, "loss": 0.3031, "step": 600 }, { "epoch": 2.349514563106796, "grad_norm": 0.3125, "learning_rate": 2.7003909261010928e-05, "loss": 0.2894, "step": 605 }, { "epoch": 2.3689320388349513, "grad_norm": 0.314453125, "learning_rate": 2.5473555032424533e-05, "loss": 0.3136, "step": 610 }, { "epoch": 2.3883495145631066, "grad_norm": 0.33984375, "learning_rate": 2.3981489016705205e-05, "loss": 0.3078, "step": 615 }, { "epoch": 2.407766990291262, "grad_norm": 0.322265625, "learning_rate": 2.2528477767808963e-05, "loss": 0.3094, "step": 620 }, { "epoch": 2.4271844660194173, "grad_norm": 0.31640625, "learning_rate": 2.1115267775173532e-05, "loss": 0.2985, "step": 625 }, { "epoch": 2.4466019417475726, "grad_norm": 0.322265625, "learning_rate": 1.9742585080206755e-05, "loss": 0.3191, "step": 630 }, { "epoch": 2.466019417475728, "grad_norm": 0.3203125, "learning_rate": 1.8411134903280303e-05, "loss": 0.3117, "step": 635 }, { "epoch": 2.4854368932038833, "grad_norm": 0.306640625, "learning_rate": 1.7121601281420495e-05, "loss": 0.3086, "step": 640 }, { "epoch": 2.5048543689320386, "grad_norm": 0.33203125, "learning_rate": 1.587464671688187e-05, "loss": 0.3072, "step": 645 }, { "epoch": 2.524271844660194, "grad_norm": 0.306640625, "learning_rate": 1.467091183678444e-05, "loss": 0.3044, "step": 650 }, { "epoch": 2.5436893203883493, "grad_norm": 0.298828125, "learning_rate": 1.3511015063989274e-05, "loss": 0.3081, "step": 655 }, { "epoch": 2.5631067961165046, "grad_norm": 0.275390625, "learning_rate": 1.2395552299381741e-05, "loss": 0.3005, "step": 660 }, { "epoch": 2.58252427184466, "grad_norm": 0.3125, "learning_rate": 1.1325096615725427e-05, "loss": 0.3034, "step": 665 }, { "epoch": 2.6019417475728153, "grad_norm": 0.3125, "learning_rate": 1.030019796324404e-05, "loss": 0.3075, "step": 670 }, { "epoch": 2.6213592233009706, "grad_norm": 0.3125, "learning_rate": 9.321382887082563e-06, "loss": 0.3084, "step": 675 }, { "epoch": 2.6407766990291264, "grad_norm": 0.3203125, "learning_rate": 8.38915425679304e-06, "loss": 0.3064, "step": 680 }, { "epoch": 2.6601941747572817, "grad_norm": 0.310546875, "learning_rate": 7.503991007983524e-06, "loss": 0.3087, "step": 685 }, { "epoch": 2.679611650485437, "grad_norm": 0.3359375, "learning_rate": 6.666347896263325e-06, "loss": 0.3129, "step": 690 }, { "epoch": 2.6990291262135924, "grad_norm": 0.291015625, "learning_rate": 5.876655263610842e-06, "loss": 0.2897, "step": 695 }, { "epoch": 2.7184466019417477, "grad_norm": 0.31640625, "learning_rate": 5.1353188172838074e-06, "loss": 0.3033, "step": 700 }, { "epoch": 2.737864077669903, "grad_norm": 0.30859375, "learning_rate": 4.442719421385922e-06, "loss": 0.2994, "step": 705 }, { "epoch": 2.7572815533980584, "grad_norm": 0.330078125, "learning_rate": 3.7992129011965803e-06, "loss": 0.3152, "step": 710 }, { "epoch": 2.7766990291262137, "grad_norm": 0.314453125, "learning_rate": 3.2051298603643753e-06, "loss": 0.3101, "step": 715 }, { "epoch": 2.796116504854369, "grad_norm": 0.30859375, "learning_rate": 2.6607755110584887e-06, "loss": 0.3125, "step": 720 }, { "epoch": 2.8155339805825244, "grad_norm": 0.291015625, "learning_rate": 2.1664295171648364e-06, "loss": 0.3024, "step": 725 }, { "epoch": 2.8349514563106797, "grad_norm": 0.31640625, "learning_rate": 1.7223458506077316e-06, "loss": 0.2913, "step": 730 }, { "epoch": 2.854368932038835, "grad_norm": 0.3046875, "learning_rate": 1.3287526608711131e-06, "loss": 0.2968, "step": 735 }, { "epoch": 2.8737864077669903, "grad_norm": 0.3203125, "learning_rate": 9.85852157785816e-07, "loss": 0.3033, "step": 740 }, { "epoch": 2.8932038834951457, "grad_norm": 0.310546875, "learning_rate": 6.938205076436832e-07, "loss": 0.2991, "step": 745 }, { "epoch": 2.912621359223301, "grad_norm": 0.328125, "learning_rate": 4.5280774269154115e-07, "loss": 0.3003, "step": 750 }, { "epoch": 2.9320388349514563, "grad_norm": 0.2890625, "learning_rate": 2.629376840515452e-07, "loss": 0.2999, "step": 755 }, { "epoch": 2.9514563106796117, "grad_norm": 0.32421875, "learning_rate": 1.2430787810776555e-07, "loss": 0.3023, "step": 760 }, { "epoch": 2.970873786407767, "grad_norm": 0.326171875, "learning_rate": 3.6989546391297256e-08, "loss": 0.3106, "step": 765 }, { "epoch": 2.9902912621359223, "grad_norm": 0.318359375, "learning_rate": 1.0275489900624102e-09, "loss": 0.3036, "step": 770 }, { "epoch": 2.994174757281553, "eval_loss": 0.38681623339653015, "eval_runtime": 22.0402, "eval_samples_per_second": 4.991, "eval_steps_per_second": 0.635, "step": 771 }, { "epoch": 2.994174757281553, "step": 771, "total_flos": 5.421128640186286e+17, "train_loss": 0.2601610358741651, "train_runtime": 3322.581, "train_samples_per_second": 1.86, "train_steps_per_second": 0.232 } ], "logging_steps": 5, "max_steps": 771, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.421128640186286e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }