{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 15.1875, "learning_rate": 1.4598540145985402e-07, "loss": 3.5409, "step": 1 }, { "epoch": 0.0, "grad_norm": 13.625, "learning_rate": 7.299270072992701e-07, "loss": 3.4713, "step": 5 }, { "epoch": 0.01, "grad_norm": 13.0, "learning_rate": 1.4598540145985402e-06, "loss": 3.4904, "step": 10 }, { "epoch": 0.01, "grad_norm": 13.8125, "learning_rate": 2.1897810218978103e-06, "loss": 3.4687, "step": 15 }, { "epoch": 0.01, "grad_norm": 15.9375, "learning_rate": 2.9197080291970804e-06, "loss": 3.4864, "step": 20 }, { "epoch": 0.02, "grad_norm": 13.4375, "learning_rate": 3.6496350364963505e-06, "loss": 3.4514, "step": 25 }, { "epoch": 0.02, "grad_norm": 14.0625, "learning_rate": 4.379562043795621e-06, "loss": 3.4462, "step": 30 }, { "epoch": 0.03, "grad_norm": 13.0, "learning_rate": 5.1094890510948916e-06, "loss": 3.407, "step": 35 }, { "epoch": 0.03, "grad_norm": 15.3125, "learning_rate": 5.839416058394161e-06, "loss": 3.4481, "step": 40 }, { "epoch": 0.03, "grad_norm": 15.25, "learning_rate": 6.569343065693431e-06, "loss": 3.4538, "step": 45 }, { "epoch": 0.04, "grad_norm": 12.75, "learning_rate": 7.299270072992701e-06, "loss": 3.3903, "step": 50 }, { "epoch": 0.04, "grad_norm": 12.25, "learning_rate": 8.029197080291972e-06, "loss": 3.349, "step": 55 }, { "epoch": 0.04, "grad_norm": 13.875, "learning_rate": 8.759124087591241e-06, "loss": 3.357, "step": 60 }, { "epoch": 0.05, "grad_norm": 11.8125, "learning_rate": 9.48905109489051e-06, "loss": 3.2995, "step": 65 }, { "epoch": 0.05, "grad_norm": 11.75, "learning_rate": 1.0218978102189783e-05, "loss": 3.2453, "step": 70 }, { "epoch": 0.05, "grad_norm": 11.875, "learning_rate": 1.0948905109489052e-05, "loss": 3.2581, "step": 75 }, { "epoch": 0.06, "grad_norm": 12.75, "learning_rate": 1.1678832116788322e-05, "loss": 3.2102, "step": 80 }, { "epoch": 0.06, "grad_norm": 10.25, "learning_rate": 1.2408759124087593e-05, "loss": 3.1574, "step": 85 }, { "epoch": 0.07, "grad_norm": 9.875, "learning_rate": 1.3138686131386862e-05, "loss": 3.0819, "step": 90 }, { "epoch": 0.07, "grad_norm": 8.8125, "learning_rate": 1.3868613138686133e-05, "loss": 3.0446, "step": 95 }, { "epoch": 0.07, "grad_norm": 8.4375, "learning_rate": 1.4598540145985402e-05, "loss": 2.961, "step": 100 }, { "epoch": 0.08, "grad_norm": 7.78125, "learning_rate": 1.5328467153284673e-05, "loss": 2.9056, "step": 105 }, { "epoch": 0.08, "grad_norm": 6.5625, "learning_rate": 1.6058394160583944e-05, "loss": 2.8261, "step": 110 }, { "epoch": 0.08, "grad_norm": 5.75, "learning_rate": 1.678832116788321e-05, "loss": 2.7407, "step": 115 }, { "epoch": 0.09, "grad_norm": 5.34375, "learning_rate": 1.7518248175182482e-05, "loss": 2.6689, "step": 120 }, { "epoch": 0.09, "grad_norm": 4.46875, "learning_rate": 1.8248175182481753e-05, "loss": 2.6545, "step": 125 }, { "epoch": 0.1, "grad_norm": 4.59375, "learning_rate": 1.897810218978102e-05, "loss": 2.5854, "step": 130 }, { "epoch": 0.1, "grad_norm": 3.75, "learning_rate": 1.9708029197080295e-05, "loss": 2.5589, "step": 135 }, { "epoch": 0.1, "grad_norm": 3.40625, "learning_rate": 1.99997059600332e-05, "loss": 2.5376, "step": 140 }, { "epoch": 0.11, "grad_norm": 3.390625, "learning_rate": 1.9997909111745443e-05, "loss": 2.5087, "step": 145 }, { "epoch": 0.11, "grad_norm": 3.375, "learning_rate": 1.9994479063873808e-05, "loss": 2.4925, "step": 150 }, { "epoch": 0.11, "grad_norm": 2.984375, "learning_rate": 1.998941637673072e-05, "loss": 2.4023, "step": 155 }, { "epoch": 0.12, "grad_norm": 3.015625, "learning_rate": 1.9982721877326954e-05, "loss": 2.4525, "step": 160 }, { "epoch": 0.12, "grad_norm": 2.890625, "learning_rate": 1.9974396659236522e-05, "loss": 2.4526, "step": 165 }, { "epoch": 0.12, "grad_norm": 3.203125, "learning_rate": 1.9964442082418057e-05, "loss": 2.416, "step": 170 }, { "epoch": 0.13, "grad_norm": 2.953125, "learning_rate": 1.9952859772992626e-05, "loss": 2.4271, "step": 175 }, { "epoch": 0.13, "grad_norm": 3.0, "learning_rate": 1.9939651622978127e-05, "loss": 2.4125, "step": 180 }, { "epoch": 0.14, "grad_norm": 2.71875, "learning_rate": 1.9924819789980204e-05, "loss": 2.421, "step": 185 }, { "epoch": 0.14, "grad_norm": 2.765625, "learning_rate": 1.990836669683979e-05, "loss": 2.3559, "step": 190 }, { "epoch": 0.14, "grad_norm": 2.609375, "learning_rate": 1.9890295031237336e-05, "loss": 2.3899, "step": 195 }, { "epoch": 0.15, "grad_norm": 2.609375, "learning_rate": 1.9870607745253765e-05, "loss": 2.3836, "step": 200 }, { "epoch": 0.15, "grad_norm": 2.859375, "learning_rate": 1.9849308054888235e-05, "loss": 2.3803, "step": 205 }, { "epoch": 0.15, "grad_norm": 2.734375, "learning_rate": 1.9826399439532787e-05, "loss": 2.3862, "step": 210 }, { "epoch": 0.16, "grad_norm": 2.6875, "learning_rate": 1.980188564140399e-05, "loss": 2.332, "step": 215 }, { "epoch": 0.16, "grad_norm": 2.84375, "learning_rate": 1.9775770664931613e-05, "loss": 2.3934, "step": 220 }, { "epoch": 0.16, "grad_norm": 3.03125, "learning_rate": 1.97480587761045e-05, "loss": 2.3672, "step": 225 }, { "epoch": 0.17, "grad_norm": 2.828125, "learning_rate": 1.9718754501773688e-05, "loss": 2.3645, "step": 230 }, { "epoch": 0.17, "grad_norm": 2.703125, "learning_rate": 1.9687862628912952e-05, "loss": 2.3382, "step": 235 }, { "epoch": 0.18, "grad_norm": 2.625, "learning_rate": 1.9655388203836804e-05, "loss": 2.3214, "step": 240 }, { "epoch": 0.18, "grad_norm": 2.6875, "learning_rate": 1.962133653137618e-05, "loss": 2.344, "step": 245 }, { "epoch": 0.18, "grad_norm": 2.5, "learning_rate": 1.958571317401186e-05, "loss": 2.3284, "step": 250 }, { "epoch": 0.19, "grad_norm": 2.421875, "learning_rate": 1.954852395096582e-05, "loss": 2.3248, "step": 255 }, { "epoch": 0.19, "grad_norm": 2.421875, "learning_rate": 1.9509774937250646e-05, "loss": 2.3123, "step": 260 }, { "epoch": 0.19, "grad_norm": 2.546875, "learning_rate": 1.9469472462677128e-05, "loss": 2.2931, "step": 265 }, { "epoch": 0.2, "grad_norm": 2.578125, "learning_rate": 1.9427623110820295e-05, "loss": 2.3049, "step": 270 }, { "epoch": 0.2, "grad_norm": 2.484375, "learning_rate": 1.9384233717943934e-05, "loss": 2.3491, "step": 275 }, { "epoch": 0.2, "grad_norm": 2.78125, "learning_rate": 1.933931137188387e-05, "loss": 2.3268, "step": 280 }, { "epoch": 0.21, "grad_norm": 2.671875, "learning_rate": 1.9292863410890138e-05, "loss": 2.3244, "step": 285 }, { "epoch": 0.21, "grad_norm": 2.34375, "learning_rate": 1.9244897422428254e-05, "loss": 2.2832, "step": 290 }, { "epoch": 0.22, "grad_norm": 2.5625, "learning_rate": 1.919542124193976e-05, "loss": 2.309, "step": 295 }, { "epoch": 0.22, "grad_norm": 2.734375, "learning_rate": 1.9144442951562275e-05, "loss": 2.313, "step": 300 }, { "epoch": 0.22, "grad_norm": 2.609375, "learning_rate": 1.909197087880925e-05, "loss": 2.3164, "step": 305 }, { "epoch": 0.23, "grad_norm": 2.5625, "learning_rate": 1.9038013595209647e-05, "loss": 2.3098, "step": 310 }, { "epoch": 0.23, "grad_norm": 2.53125, "learning_rate": 1.8982579914907706e-05, "loss": 2.3103, "step": 315 }, { "epoch": 0.23, "grad_norm": 2.4375, "learning_rate": 1.8925678893223155e-05, "loss": 2.293, "step": 320 }, { "epoch": 0.24, "grad_norm": 2.375, "learning_rate": 1.8867319825171972e-05, "loss": 2.3174, "step": 325 }, { "epoch": 0.24, "grad_norm": 2.609375, "learning_rate": 1.880751224394801e-05, "loss": 2.3213, "step": 330 }, { "epoch": 0.25, "grad_norm": 2.375, "learning_rate": 1.8746265919365707e-05, "loss": 2.288, "step": 335 }, { "epoch": 0.25, "grad_norm": 2.5, "learning_rate": 1.8683590856264137e-05, "loss": 2.2918, "step": 340 }, { "epoch": 0.25, "grad_norm": 2.5625, "learning_rate": 1.8619497292872702e-05, "loss": 2.2748, "step": 345 }, { "epoch": 0.26, "grad_norm": 2.28125, "learning_rate": 1.855399569913866e-05, "loss": 2.2933, "step": 350 }, { "epoch": 0.26, "grad_norm": 2.390625, "learning_rate": 1.8487096775016807e-05, "loss": 2.3154, "step": 355 }, { "epoch": 0.26, "grad_norm": 2.515625, "learning_rate": 1.841881144872162e-05, "loss": 2.2824, "step": 360 }, { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.8349150874942067e-05, "loss": 2.3012, "step": 365 }, { "epoch": 0.27, "grad_norm": 2.25, "learning_rate": 1.8278126433019453e-05, "loss": 2.3538, "step": 370 }, { "epoch": 0.27, "grad_norm": 2.546875, "learning_rate": 1.8205749725088556e-05, "loss": 2.2788, "step": 375 }, { "epoch": 0.28, "grad_norm": 2.453125, "learning_rate": 1.8132032574182367e-05, "loss": 2.2883, "step": 380 }, { "epoch": 0.28, "grad_norm": 2.25, "learning_rate": 1.8056987022300746e-05, "loss": 2.3311, "step": 385 }, { "epoch": 0.29, "grad_norm": 2.171875, "learning_rate": 1.798062532844333e-05, "loss": 2.3094, "step": 390 }, { "epoch": 0.29, "grad_norm": 2.203125, "learning_rate": 1.7902959966606954e-05, "loss": 2.3103, "step": 395 }, { "epoch": 0.29, "grad_norm": 2.71875, "learning_rate": 1.7824003623747984e-05, "loss": 2.2717, "step": 400 }, { "epoch": 0.3, "grad_norm": 2.46875, "learning_rate": 1.7743769197709836e-05, "loss": 2.313, "step": 405 }, { "epoch": 0.3, "grad_norm": 2.234375, "learning_rate": 1.7662269795116093e-05, "loss": 2.2838, "step": 410 }, { "epoch": 0.3, "grad_norm": 2.28125, "learning_rate": 1.757951872922944e-05, "loss": 2.2888, "step": 415 }, { "epoch": 0.31, "grad_norm": 2.3125, "learning_rate": 1.749552951777693e-05, "loss": 2.2648, "step": 420 }, { "epoch": 0.31, "grad_norm": 2.921875, "learning_rate": 1.7410315880741765e-05, "loss": 2.3002, "step": 425 }, { "epoch": 0.31, "grad_norm": 2.1875, "learning_rate": 1.7323891738122126e-05, "loss": 2.2758, "step": 430 }, { "epoch": 0.32, "grad_norm": 2.5, "learning_rate": 1.7236271207657246e-05, "loss": 2.3409, "step": 435 }, { "epoch": 0.32, "grad_norm": 2.390625, "learning_rate": 1.714746860252124e-05, "loss": 2.3065, "step": 440 }, { "epoch": 0.33, "grad_norm": 2.171875, "learning_rate": 1.7057498428984988e-05, "loss": 2.2724, "step": 445 }, { "epoch": 0.33, "grad_norm": 2.40625, "learning_rate": 1.696637538404646e-05, "loss": 2.2881, "step": 450 }, { "epoch": 0.33, "grad_norm": 2.328125, "learning_rate": 1.6874114353029915e-05, "loss": 2.3293, "step": 455 }, { "epoch": 0.34, "grad_norm": 2.84375, "learning_rate": 1.678073040715433e-05, "loss": 2.2792, "step": 460 }, { "epoch": 0.34, "grad_norm": 2.21875, "learning_rate": 1.6686238801071435e-05, "loss": 2.2359, "step": 465 }, { "epoch": 0.34, "grad_norm": 2.265625, "learning_rate": 1.659065497037381e-05, "loss": 2.2881, "step": 470 }, { "epoch": 0.35, "grad_norm": 2.09375, "learning_rate": 1.649399452907343e-05, "loss": 2.2918, "step": 475 }, { "epoch": 0.35, "grad_norm": 2.4375, "learning_rate": 1.639627326705103e-05, "loss": 2.2801, "step": 480 }, { "epoch": 0.36, "grad_norm": 2.265625, "learning_rate": 1.629750714747677e-05, "loss": 2.2948, "step": 485 }, { "epoch": 0.36, "grad_norm": 2.546875, "learning_rate": 1.619771230420261e-05, "loss": 2.2611, "step": 490 }, { "epoch": 0.36, "grad_norm": 2.359375, "learning_rate": 1.609690503912674e-05, "loss": 2.3027, "step": 495 }, { "epoch": 0.37, "grad_norm": 2.25, "learning_rate": 1.599510181953062e-05, "loss": 2.2495, "step": 500 }, { "epoch": 0.37, "grad_norm": 2.40625, "learning_rate": 1.589231927538898e-05, "loss": 2.2587, "step": 505 }, { "epoch": 0.37, "grad_norm": 2.265625, "learning_rate": 1.578857419665325e-05, "loss": 2.2614, "step": 510 }, { "epoch": 0.38, "grad_norm": 2.34375, "learning_rate": 1.568388353050883e-05, "loss": 2.2618, "step": 515 }, { "epoch": 0.38, "grad_norm": 2.21875, "learning_rate": 1.5578264378606727e-05, "loss": 2.3015, "step": 520 }, { "epoch": 0.38, "grad_norm": 2.296875, "learning_rate": 1.5471733994269916e-05, "loss": 2.2542, "step": 525 }, { "epoch": 0.39, "grad_norm": 2.34375, "learning_rate": 1.5364309779674925e-05, "loss": 2.2573, "step": 530 }, { "epoch": 0.39, "grad_norm": 2.390625, "learning_rate": 1.5256009283009148e-05, "loss": 2.291, "step": 535 }, { "epoch": 0.4, "grad_norm": 2.640625, "learning_rate": 1.5146850195604255e-05, "loss": 2.2856, "step": 540 }, { "epoch": 0.4, "grad_norm": 2.21875, "learning_rate": 1.5036850349046236e-05, "loss": 2.2403, "step": 545 }, { "epoch": 0.4, "grad_norm": 2.265625, "learning_rate": 1.4926027712262565e-05, "loss": 2.324, "step": 550 }, { "epoch": 0.41, "grad_norm": 2.21875, "learning_rate": 1.4814400388586871e-05, "loss": 2.2583, "step": 555 }, { "epoch": 0.41, "grad_norm": 2.21875, "learning_rate": 1.4701986612801699e-05, "loss": 2.2787, "step": 560 }, { "epoch": 0.41, "grad_norm": 2.28125, "learning_rate": 1.4588804748159783e-05, "loss": 2.2246, "step": 565 }, { "epoch": 0.42, "grad_norm": 2.234375, "learning_rate": 1.447487328338434e-05, "loss": 2.3084, "step": 570 }, { "epoch": 0.42, "grad_norm": 2.15625, "learning_rate": 1.4360210829648834e-05, "loss": 2.2681, "step": 575 }, { "epoch": 0.42, "grad_norm": 2.40625, "learning_rate": 1.4244836117536797e-05, "loss": 2.2876, "step": 580 }, { "epoch": 0.43, "grad_norm": 2.390625, "learning_rate": 1.4128767993982097e-05, "loss": 2.2373, "step": 585 }, { "epoch": 0.43, "grad_norm": 2.3125, "learning_rate": 1.401202541919021e-05, "loss": 2.2318, "step": 590 }, { "epoch": 0.44, "grad_norm": 2.28125, "learning_rate": 1.389462746354099e-05, "loss": 2.2876, "step": 595 }, { "epoch": 0.44, "grad_norm": 2.28125, "learning_rate": 1.3776593304473464e-05, "loss": 2.2743, "step": 600 }, { "epoch": 0.44, "grad_norm": 2.25, "learning_rate": 1.3657942223353093e-05, "loss": 2.3012, "step": 605 }, { "epoch": 0.45, "grad_norm": 2.28125, "learning_rate": 1.3538693602322102e-05, "loss": 2.2909, "step": 610 }, { "epoch": 0.45, "grad_norm": 2.3125, "learning_rate": 1.3418866921133309e-05, "loss": 2.2919, "step": 615 }, { "epoch": 0.45, "grad_norm": 2.09375, "learning_rate": 1.3298481753968044e-05, "loss": 2.2759, "step": 620 }, { "epoch": 0.46, "grad_norm": 2.421875, "learning_rate": 1.317755776623862e-05, "loss": 2.2374, "step": 625 }, { "epoch": 0.46, "grad_norm": 2.328125, "learning_rate": 1.3056114711375898e-05, "loss": 2.2749, "step": 630 }, { "epoch": 0.46, "grad_norm": 2.171875, "learning_rate": 1.2934172427602499e-05, "loss": 2.2678, "step": 635 }, { "epoch": 0.47, "grad_norm": 2.390625, "learning_rate": 1.2811750834692134e-05, "loss": 2.2902, "step": 640 }, { "epoch": 0.47, "grad_norm": 2.25, "learning_rate": 1.268886993071564e-05, "loss": 2.2685, "step": 645 }, { "epoch": 0.48, "grad_norm": 2.1875, "learning_rate": 1.2565549788774198e-05, "loss": 2.2695, "step": 650 }, { "epoch": 0.48, "grad_norm": 2.28125, "learning_rate": 1.2441810553720324e-05, "loss": 2.2628, "step": 655 }, { "epoch": 0.48, "grad_norm": 2.59375, "learning_rate": 1.2317672438867114e-05, "loss": 2.2308, "step": 660 }, { "epoch": 0.49, "grad_norm": 2.125, "learning_rate": 1.2193155722686326e-05, "loss": 2.2787, "step": 665 }, { "epoch": 0.49, "grad_norm": 2.359375, "learning_rate": 1.2068280745495797e-05, "loss": 2.2848, "step": 670 }, { "epoch": 0.49, "grad_norm": 2.328125, "learning_rate": 1.1943067906136772e-05, "loss": 2.2716, "step": 675 }, { "epoch": 0.5, "grad_norm": 2.328125, "learning_rate": 1.1817537658641677e-05, "loss": 2.2833, "step": 680 }, { "epoch": 0.5, "grad_norm": 2.28125, "learning_rate": 1.1691710508892859e-05, "loss": 2.2403, "step": 685 }, { "epoch": 0.51, "grad_norm": 2.265625, "learning_rate": 1.1565607011272874e-05, "loss": 2.2913, "step": 690 }, { "epoch": 0.51, "grad_norm": 2.28125, "learning_rate": 1.1439247765306835e-05, "loss": 2.2717, "step": 695 }, { "epoch": 0.51, "grad_norm": 2.109375, "learning_rate": 1.13126534122974e-05, "loss": 2.255, "step": 700 }, { "epoch": 0.52, "grad_norm": 2.359375, "learning_rate": 1.1185844631952926e-05, "loss": 2.2996, "step": 705 }, { "epoch": 0.52, "grad_norm": 2.25, "learning_rate": 1.1058842139009369e-05, "loss": 2.2673, "step": 710 }, { "epoch": 0.52, "grad_norm": 2.234375, "learning_rate": 1.0931666679846427e-05, "loss": 2.2508, "step": 715 }, { "epoch": 0.53, "grad_norm": 2.296875, "learning_rate": 1.0804339029098542e-05, "loss": 2.2279, "step": 720 }, { "epoch": 0.53, "grad_norm": 2.296875, "learning_rate": 1.0676879986261274e-05, "loss": 2.2841, "step": 725 }, { "epoch": 0.53, "grad_norm": 2.09375, "learning_rate": 1.0549310372293631e-05, "loss": 2.2626, "step": 730 }, { "epoch": 0.54, "grad_norm": 2.15625, "learning_rate": 1.0421651026216859e-05, "loss": 2.2737, "step": 735 }, { "epoch": 0.54, "grad_norm": 2.28125, "learning_rate": 1.0293922801710322e-05, "loss": 2.2713, "step": 740 }, { "epoch": 0.55, "grad_norm": 2.40625, "learning_rate": 1.0166146563704953e-05, "loss": 2.2836, "step": 745 }, { "epoch": 0.55, "grad_norm": 2.125, "learning_rate": 1.003834318497489e-05, "loss": 2.2993, "step": 750 }, { "epoch": 0.55, "grad_norm": 2.734375, "learning_rate": 9.910533542727826e-06, "loss": 2.2349, "step": 755 }, { "epoch": 0.56, "grad_norm": 2.125, "learning_rate": 9.782738515194611e-06, "loss": 2.2954, "step": 760 }, { "epoch": 0.56, "grad_norm": 2.09375, "learning_rate": 9.654978978218735e-06, "loss": 2.2879, "step": 765 }, { "epoch": 0.56, "grad_norm": 2.4375, "learning_rate": 9.527275801846148e-06, "loss": 2.2637, "step": 770 }, { "epoch": 0.57, "grad_norm": 2.15625, "learning_rate": 9.399649846916075e-06, "loss": 2.2558, "step": 775 }, { "epoch": 0.57, "grad_norm": 2.265625, "learning_rate": 9.272121961653293e-06, "loss": 2.2513, "step": 780 }, { "epoch": 0.57, "grad_norm": 2.421875, "learning_rate": 9.1447129782625e-06, "loss": 2.2602, "step": 785 }, { "epoch": 0.58, "grad_norm": 2.515625, "learning_rate": 9.017443709525278e-06, "loss": 2.2518, "step": 790 }, { "epoch": 0.58, "grad_norm": 2.96875, "learning_rate": 8.890334945400256e-06, "loss": 2.2511, "step": 795 }, { "epoch": 0.59, "grad_norm": 2.1875, "learning_rate": 8.763407449626956e-06, "loss": 2.2424, "step": 800 }, { "epoch": 0.59, "grad_norm": 2.1875, "learning_rate": 8.636681956333992e-06, "loss": 2.2414, "step": 805 }, { "epoch": 0.59, "grad_norm": 2.28125, "learning_rate": 8.510179166652023e-06, "loss": 2.2786, "step": 810 }, { "epoch": 0.6, "grad_norm": 2.3125, "learning_rate": 8.383919745332171e-06, "loss": 2.2856, "step": 815 }, { "epoch": 0.6, "grad_norm": 1.984375, "learning_rate": 8.257924317370323e-06, "loss": 2.283, "step": 820 }, { "epoch": 0.6, "grad_norm": 2.46875, "learning_rate": 8.132213464637972e-06, "loss": 2.2562, "step": 825 }, { "epoch": 0.61, "grad_norm": 2.0625, "learning_rate": 8.006807722520073e-06, "loss": 2.2338, "step": 830 }, { "epoch": 0.61, "grad_norm": 2.484375, "learning_rate": 7.881727576560513e-06, "loss": 2.2892, "step": 835 }, { "epoch": 0.61, "grad_norm": 2.203125, "learning_rate": 7.756993459115696e-06, "loss": 2.2578, "step": 840 }, { "epoch": 0.62, "grad_norm": 2.546875, "learning_rate": 7.632625746016859e-06, "loss": 2.2318, "step": 845 }, { "epoch": 0.62, "grad_norm": 2.125, "learning_rate": 7.508644753241568e-06, "loss": 2.2991, "step": 850 }, { "epoch": 0.63, "grad_norm": 2.234375, "learning_rate": 7.385070733595044e-06, "loss": 2.2888, "step": 855 }, { "epoch": 0.63, "grad_norm": 2.21875, "learning_rate": 7.261923873401762e-06, "loss": 2.3097, "step": 860 }, { "epoch": 0.63, "grad_norm": 2.375, "learning_rate": 7.139224289207958e-06, "loss": 2.2517, "step": 865 }, { "epoch": 0.64, "grad_norm": 2.203125, "learning_rate": 7.016992024495475e-06, "loss": 2.2512, "step": 870 }, { "epoch": 0.64, "grad_norm": 3.015625, "learning_rate": 6.895247046407616e-06, "loss": 2.2245, "step": 875 }, { "epoch": 0.64, "grad_norm": 2.171875, "learning_rate": 6.774009242487393e-06, "loss": 2.2485, "step": 880 }, { "epoch": 0.65, "grad_norm": 2.765625, "learning_rate": 6.653298417428841e-06, "loss": 2.2257, "step": 885 }, { "epoch": 0.65, "grad_norm": 2.109375, "learning_rate": 6.533134289841822e-06, "loss": 2.2616, "step": 890 }, { "epoch": 0.66, "grad_norm": 2.25, "learning_rate": 6.4135364890309245e-06, "loss": 2.2599, "step": 895 }, { "epoch": 0.66, "grad_norm": 2.15625, "learning_rate": 6.294524551788928e-06, "loss": 2.2262, "step": 900 }, { "epoch": 0.66, "grad_norm": 2.078125, "learning_rate": 6.176117919205378e-06, "loss": 2.2752, "step": 905 }, { "epoch": 0.67, "grad_norm": 2.09375, "learning_rate": 6.0583359334908275e-06, "loss": 2.2541, "step": 910 }, { "epoch": 0.67, "grad_norm": 2.375, "learning_rate": 5.941197834817183e-06, "loss": 2.2642, "step": 915 }, { "epoch": 0.67, "grad_norm": 2.421875, "learning_rate": 5.824722758174763e-06, "loss": 2.2879, "step": 920 }, { "epoch": 0.68, "grad_norm": 2.34375, "learning_rate": 5.708929730246501e-06, "loss": 2.2794, "step": 925 }, { "epoch": 0.68, "grad_norm": 2.21875, "learning_rate": 5.593837666299871e-06, "loss": 2.2787, "step": 930 }, { "epoch": 0.68, "grad_norm": 2.109375, "learning_rate": 5.479465367096999e-06, "loss": 2.2598, "step": 935 }, { "epoch": 0.69, "grad_norm": 2.25, "learning_rate": 5.365831515823478e-06, "loss": 2.2633, "step": 940 }, { "epoch": 0.69, "grad_norm": 2.3125, "learning_rate": 5.252954675036384e-06, "loss": 2.2602, "step": 945 }, { "epoch": 0.7, "grad_norm": 2.28125, "learning_rate": 5.140853283632039e-06, "loss": 2.2585, "step": 950 }, { "epoch": 0.7, "grad_norm": 2.3125, "learning_rate": 5.029545653833887e-06, "loss": 2.279, "step": 955 }, { "epoch": 0.7, "grad_norm": 2.125, "learning_rate": 4.919049968201182e-06, "loss": 2.264, "step": 960 }, { "epoch": 0.71, "grad_norm": 2.15625, "learning_rate": 4.809384276658728e-06, "loss": 2.2765, "step": 965 }, { "epoch": 0.71, "grad_norm": 2.578125, "learning_rate": 4.7005664935483995e-06, "loss": 2.273, "step": 970 }, { "epoch": 0.71, "grad_norm": 2.25, "learning_rate": 4.592614394702731e-06, "loss": 2.2695, "step": 975 }, { "epoch": 0.72, "grad_norm": 2.234375, "learning_rate": 4.4855456145411754e-06, "loss": 2.2415, "step": 980 }, { "epoch": 0.72, "grad_norm": 2.328125, "learning_rate": 4.379377643189444e-06, "loss": 2.2602, "step": 985 }, { "epoch": 0.72, "grad_norm": 2.03125, "learning_rate": 4.27412782362242e-06, "loss": 2.2589, "step": 990 }, { "epoch": 0.73, "grad_norm": 2.109375, "learning_rate": 4.169813348831121e-06, "loss": 2.2712, "step": 995 }, { "epoch": 0.73, "grad_norm": 2.5, "learning_rate": 4.0664512590141415e-06, "loss": 2.27, "step": 1000 }, { "epoch": 0.74, "grad_norm": 2.1875, "learning_rate": 3.9640584387940765e-06, "loss": 2.2953, "step": 1005 }, { "epoch": 0.74, "grad_norm": 2.203125, "learning_rate": 3.862651614459346e-06, "loss": 2.2796, "step": 1010 }, { "epoch": 0.74, "grad_norm": 2.53125, "learning_rate": 3.7622473512318745e-06, "loss": 2.286, "step": 1015 }, { "epoch": 0.75, "grad_norm": 2.234375, "learning_rate": 3.662862050561128e-06, "loss": 2.274, "step": 1020 }, { "epoch": 0.75, "grad_norm": 2.140625, "learning_rate": 3.564511947444821e-06, "loss": 2.2617, "step": 1025 }, { "epoch": 0.75, "grad_norm": 2.1875, "learning_rate": 3.4672131077769056e-06, "loss": 2.2636, "step": 1030 }, { "epoch": 0.76, "grad_norm": 2.3125, "learning_rate": 3.37098142572312e-06, "loss": 2.2809, "step": 1035 }, { "epoch": 0.76, "grad_norm": 2.328125, "learning_rate": 3.275832621124615e-06, "loss": 2.2541, "step": 1040 }, { "epoch": 0.77, "grad_norm": 2.46875, "learning_rate": 3.18178223693006e-06, "loss": 2.297, "step": 1045 }, { "epoch": 0.77, "grad_norm": 2.5, "learning_rate": 3.0888456366566165e-06, "loss": 2.2861, "step": 1050 }, { "epoch": 0.77, "grad_norm": 2.15625, "learning_rate": 2.997038001880287e-06, "loss": 2.2931, "step": 1055 }, { "epoch": 0.78, "grad_norm": 2.125, "learning_rate": 2.9063743297558843e-06, "loss": 2.2441, "step": 1060 }, { "epoch": 0.78, "grad_norm": 2.34375, "learning_rate": 2.8168694305672473e-06, "loss": 2.2687, "step": 1065 }, { "epoch": 0.78, "grad_norm": 2.15625, "learning_rate": 2.7285379253078626e-06, "loss": 2.2732, "step": 1070 }, { "epoch": 0.79, "grad_norm": 2.1875, "learning_rate": 2.6413942432925065e-06, "loss": 2.2441, "step": 1075 }, { "epoch": 0.79, "grad_norm": 2.140625, "learning_rate": 2.555452619800135e-06, "loss": 2.3106, "step": 1080 }, { "epoch": 0.79, "grad_norm": 2.046875, "learning_rate": 2.4707270937485038e-06, "loss": 2.2372, "step": 1085 }, { "epoch": 0.8, "grad_norm": 1.96875, "learning_rate": 2.3872315054008457e-06, "loss": 2.2571, "step": 1090 }, { "epoch": 0.8, "grad_norm": 2.140625, "learning_rate": 2.304979494105011e-06, "loss": 2.2538, "step": 1095 }, { "epoch": 0.81, "grad_norm": 2.234375, "learning_rate": 2.22398449606542e-06, "loss": 2.2863, "step": 1100 }, { "epoch": 0.81, "grad_norm": 2.203125, "learning_rate": 2.1442597421482124e-06, "loss": 2.2811, "step": 1105 }, { "epoch": 0.81, "grad_norm": 2.15625, "learning_rate": 2.0658182557199137e-06, "loss": 2.2884, "step": 1110 }, { "epoch": 0.82, "grad_norm": 2.1875, "learning_rate": 1.9886728505200436e-06, "loss": 2.2409, "step": 1115 }, { "epoch": 0.82, "grad_norm": 2.296875, "learning_rate": 1.9128361285678986e-06, "loss": 2.2267, "step": 1120 }, { "epoch": 0.82, "grad_norm": 2.171875, "learning_rate": 1.8383204781040065e-06, "loss": 2.2654, "step": 1125 }, { "epoch": 0.83, "grad_norm": 2.0625, "learning_rate": 1.7651380715664124e-06, "loss": 2.3065, "step": 1130 }, { "epoch": 0.83, "grad_norm": 2.109375, "learning_rate": 1.693300863602294e-06, "loss": 2.25, "step": 1135 }, { "epoch": 0.83, "grad_norm": 2.484375, "learning_rate": 1.6228205891151027e-06, "loss": 2.2566, "step": 1140 }, { "epoch": 0.84, "grad_norm": 2.28125, "learning_rate": 1.5537087613476255e-06, "loss": 2.2508, "step": 1145 }, { "epoch": 0.84, "grad_norm": 2.328125, "learning_rate": 1.4859766700012478e-06, "loss": 2.2346, "step": 1150 }, { "epoch": 0.85, "grad_norm": 2.78125, "learning_rate": 1.419635379391736e-06, "loss": 2.2482, "step": 1155 }, { "epoch": 0.85, "grad_norm": 2.1875, "learning_rate": 1.3546957266418315e-06, "loss": 2.2644, "step": 1160 }, { "epoch": 0.85, "grad_norm": 2.203125, "learning_rate": 1.291168319910967e-06, "loss": 2.2521, "step": 1165 }, { "epoch": 0.86, "grad_norm": 2.09375, "learning_rate": 1.2290635366623805e-06, "loss": 2.2304, "step": 1170 }, { "epoch": 0.86, "grad_norm": 2.15625, "learning_rate": 1.1683915219679054e-06, "loss": 2.2861, "step": 1175 }, { "epoch": 0.86, "grad_norm": 1.96875, "learning_rate": 1.1091621868507418e-06, "loss": 2.287, "step": 1180 }, { "epoch": 0.87, "grad_norm": 2.234375, "learning_rate": 1.0513852066664454e-06, "loss": 2.2172, "step": 1185 }, { "epoch": 0.87, "grad_norm": 2.5, "learning_rate": 9.950700195224194e-07, "loss": 2.2312, "step": 1190 }, { "epoch": 0.87, "grad_norm": 2.578125, "learning_rate": 9.402258247361584e-07, "loss": 2.2529, "step": 1195 }, { "epoch": 0.88, "grad_norm": 2.390625, "learning_rate": 8.868615813325054e-07, "loss": 2.268, "step": 1200 }, { "epoch": 0.88, "grad_norm": 2.109375, "learning_rate": 8.349860065801563e-07, "loss": 2.2757, "step": 1205 }, { "epoch": 0.89, "grad_norm": 2.375, "learning_rate": 7.846075745676584e-07, "loss": 2.2693, "step": 1210 }, { "epoch": 0.89, "grad_norm": 2.1875, "learning_rate": 7.357345148191297e-07, "loss": 2.2866, "step": 1215 }, { "epoch": 0.89, "grad_norm": 2.21875, "learning_rate": 6.883748109499455e-07, "loss": 2.2496, "step": 1220 }, { "epoch": 0.9, "grad_norm": 2.28125, "learning_rate": 6.425361993625589e-07, "loss": 2.2746, "step": 1225 }, { "epoch": 0.9, "grad_norm": 2.1875, "learning_rate": 5.982261679827561e-07, "loss": 2.2776, "step": 1230 }, { "epoch": 0.9, "grad_norm": 2.015625, "learning_rate": 5.554519550364456e-07, "loss": 2.2243, "step": 1235 }, { "epoch": 0.91, "grad_norm": 1.9765625, "learning_rate": 5.142205478672824e-07, "loss": 2.2735, "step": 1240 }, { "epoch": 0.91, "grad_norm": 2.203125, "learning_rate": 4.745386817952502e-07, "loss": 2.2766, "step": 1245 }, { "epoch": 0.92, "grad_norm": 2.28125, "learning_rate": 4.3641283901641794e-07, "loss": 2.2487, "step": 1250 }, { "epoch": 0.92, "grad_norm": 2.515625, "learning_rate": 3.9984924754405296e-07, "loss": 2.2951, "step": 1255 }, { "epoch": 0.92, "grad_norm": 2.265625, "learning_rate": 3.6485388019123826e-07, "loss": 2.2378, "step": 1260 }, { "epoch": 0.93, "grad_norm": 2.40625, "learning_rate": 3.314324535952007e-07, "loss": 2.281, "step": 1265 }, { "epoch": 0.93, "grad_norm": 2.234375, "learning_rate": 2.995904272834671e-07, "loss": 2.26, "step": 1270 }, { "epoch": 0.93, "grad_norm": 2.4375, "learning_rate": 2.6933300278202844e-07, "loss": 2.2277, "step": 1275 }, { "epoch": 0.94, "grad_norm": 2.078125, "learning_rate": 2.406651227656576e-07, "loss": 2.266, "step": 1280 }, { "epoch": 0.94, "grad_norm": 2.515625, "learning_rate": 2.135914702504871e-07, "loss": 2.2392, "step": 1285 }, { "epoch": 0.94, "grad_norm": 2.15625, "learning_rate": 1.8811646782903814e-07, "loss": 2.2394, "step": 1290 }, { "epoch": 0.95, "grad_norm": 2.0625, "learning_rate": 1.6424427694775702e-07, "loss": 2.2977, "step": 1295 }, { "epoch": 0.95, "grad_norm": 2.015625, "learning_rate": 1.4197879722723018e-07, "loss": 2.2837, "step": 1300 }, { "epoch": 0.96, "grad_norm": 2.046875, "learning_rate": 1.213236658251704e-07, "loss": 2.2568, "step": 1305 }, { "epoch": 0.96, "grad_norm": 2.375, "learning_rate": 1.0228225684226434e-07, "loss": 2.2856, "step": 1310 }, { "epoch": 0.96, "grad_norm": 1.953125, "learning_rate": 8.485768077100443e-08, "loss": 2.2729, "step": 1315 }, { "epoch": 0.97, "grad_norm": 2.234375, "learning_rate": 6.905278398757432e-08, "loss": 2.2387, "step": 1320 }, { "epoch": 0.97, "grad_norm": 2.34375, "learning_rate": 5.48701482868852e-08, "loss": 2.2797, "step": 1325 }, { "epoch": 0.97, "grad_norm": 2.21875, "learning_rate": 4.231209046082207e-08, "loss": 2.2219, "step": 1330 }, { "epoch": 0.98, "grad_norm": 2.40625, "learning_rate": 3.1380661919796495e-08, "loss": 2.2422, "step": 1335 }, { "epoch": 0.98, "grad_norm": 2.3125, "learning_rate": 2.2077648357629044e-08, "loss": 2.2803, "step": 1340 }, { "epoch": 0.98, "grad_norm": 2.09375, "learning_rate": 1.4404569459858242e-08, "loss": 2.2881, "step": 1345 }, { "epoch": 0.99, "grad_norm": 2.25, "learning_rate": 8.36267865548912e-09, "loss": 2.2251, "step": 1350 }, { "epoch": 0.99, "grad_norm": 2.265625, "learning_rate": 3.952962912238123e-09, "loss": 2.2907, "step": 1355 }, { "epoch": 1.0, "grad_norm": 2.515625, "learning_rate": 1.1761425753142875e-09, "loss": 2.2715, "step": 1360 }, { "epoch": 1.0, "grad_norm": 2.375, "learning_rate": 3.2671249743376766e-11, "loss": 2.1966, "step": 1365 }, { "epoch": 1.0, "eval_loss": 2.1976709365844727, "eval_runtime": 128.8685, "eval_samples_per_second": 21.534, "eval_steps_per_second": 2.693, "step": 1366 }, { "epoch": 1.0, "step": 1366, "total_flos": 2.02921101754368e+16, "train_loss": 0.6064129045034048, "train_runtime": 690.5421, "train_samples_per_second": 15.821, "train_steps_per_second": 1.978 } ], "logging_steps": 5, "max_steps": 1366, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 2.02921101754368e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }