{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9998306376492505, "eval_steps": 500, "global_step": 11808, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.484375, "learning_rate": 1.6934801016088062e-07, "loss": 2.5162, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.40234375, "learning_rate": 8.46740050804403e-07, "loss": 2.6069, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.515625, "learning_rate": 1.693480101608806e-06, "loss": 2.5858, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.5859375, "learning_rate": 2.5402201524132094e-06, "loss": 2.5739, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.404296875, "learning_rate": 3.386960203217612e-06, "loss": 2.588, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.46484375, "learning_rate": 4.233700254022015e-06, "loss": 2.5688, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.5859375, "learning_rate": 5.080440304826419e-06, "loss": 2.5694, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.77734375, "learning_rate": 5.927180355630822e-06, "loss": 2.5357, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.44921875, "learning_rate": 6.773920406435224e-06, "loss": 2.5469, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.5234375, "learning_rate": 7.620660457239629e-06, "loss": 2.5776, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.55859375, "learning_rate": 8.46740050804403e-06, "loss": 2.5495, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.57421875, "learning_rate": 9.314140558848434e-06, "loss": 2.489, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.427734375, "learning_rate": 1.0160880609652838e-05, "loss": 2.5028, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.37890625, "learning_rate": 1.1007620660457241e-05, "loss": 2.5345, "step": 65 }, { "epoch": 0.01, "grad_norm": 0.353515625, "learning_rate": 1.1854360711261643e-05, "loss": 2.4845, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.28515625, "learning_rate": 1.2701100762066045e-05, "loss": 2.4452, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.267578125, "learning_rate": 1.3547840812870449e-05, "loss": 2.455, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.2421875, "learning_rate": 1.4394580863674852e-05, "loss": 2.5327, "step": 85 }, { "epoch": 0.02, "grad_norm": 0.2265625, "learning_rate": 1.5241320914479258e-05, "loss": 2.4376, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.2275390625, "learning_rate": 1.608806096528366e-05, "loss": 2.4626, "step": 95 }, { "epoch": 0.02, "grad_norm": 0.2265625, "learning_rate": 1.693480101608806e-05, "loss": 2.4267, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.2177734375, "learning_rate": 1.7781541066892467e-05, "loss": 2.4574, "step": 105 }, { "epoch": 0.02, "grad_norm": 0.2333984375, "learning_rate": 1.862828111769687e-05, "loss": 2.3983, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.2421875, "learning_rate": 1.947502116850127e-05, "loss": 2.4354, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.232421875, "learning_rate": 2.0321761219305676e-05, "loss": 2.4131, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.2080078125, "learning_rate": 2.1168501270110077e-05, "loss": 2.4226, "step": 125 }, { "epoch": 0.02, "grad_norm": 0.208984375, "learning_rate": 2.2015241320914483e-05, "loss": 2.4228, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.2060546875, "learning_rate": 2.286198137171888e-05, "loss": 2.3872, "step": 135 }, { "epoch": 0.02, "grad_norm": 0.1904296875, "learning_rate": 2.3708721422523286e-05, "loss": 2.3907, "step": 140 }, { "epoch": 0.02, "grad_norm": 0.2001953125, "learning_rate": 2.455546147332769e-05, "loss": 2.3826, "step": 145 }, { "epoch": 0.03, "grad_norm": 0.2119140625, "learning_rate": 2.540220152413209e-05, "loss": 2.3294, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.197265625, "learning_rate": 2.62489415749365e-05, "loss": 2.3485, "step": 155 }, { "epoch": 0.03, "grad_norm": 0.2236328125, "learning_rate": 2.7095681625740897e-05, "loss": 2.3282, "step": 160 }, { "epoch": 0.03, "grad_norm": 0.216796875, "learning_rate": 2.79424216765453e-05, "loss": 2.3451, "step": 165 }, { "epoch": 0.03, "grad_norm": 0.19921875, "learning_rate": 2.8789161727349705e-05, "loss": 2.2971, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.21484375, "learning_rate": 2.9635901778154106e-05, "loss": 2.3093, "step": 175 }, { "epoch": 0.03, "grad_norm": 0.232421875, "learning_rate": 3.0482641828958515e-05, "loss": 2.2861, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.205078125, "learning_rate": 3.132938187976292e-05, "loss": 2.3381, "step": 185 }, { "epoch": 0.03, "grad_norm": 0.1845703125, "learning_rate": 3.217612193056732e-05, "loss": 2.269, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.185546875, "learning_rate": 3.302286198137172e-05, "loss": 2.3178, "step": 195 }, { "epoch": 0.03, "grad_norm": 0.181640625, "learning_rate": 3.386960203217612e-05, "loss": 2.2231, "step": 200 }, { "epoch": 0.03, "grad_norm": 0.173828125, "learning_rate": 3.4716342082980524e-05, "loss": 2.2515, "step": 205 }, { "epoch": 0.04, "grad_norm": 0.1865234375, "learning_rate": 3.556308213378493e-05, "loss": 2.3124, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.1826171875, "learning_rate": 3.6409822184589335e-05, "loss": 2.2382, "step": 215 }, { "epoch": 0.04, "grad_norm": 0.1884765625, "learning_rate": 3.725656223539374e-05, "loss": 2.2292, "step": 220 }, { "epoch": 0.04, "grad_norm": 0.1650390625, "learning_rate": 3.810330228619814e-05, "loss": 2.2462, "step": 225 }, { "epoch": 0.04, "grad_norm": 0.171875, "learning_rate": 3.895004233700254e-05, "loss": 2.2408, "step": 230 }, { "epoch": 0.04, "grad_norm": 0.1884765625, "learning_rate": 3.979678238780695e-05, "loss": 2.2485, "step": 235 }, { "epoch": 0.04, "grad_norm": 0.1884765625, "learning_rate": 4.064352243861135e-05, "loss": 2.319, "step": 240 }, { "epoch": 0.04, "grad_norm": 0.1904296875, "learning_rate": 4.1490262489415746e-05, "loss": 2.2125, "step": 245 }, { "epoch": 0.04, "grad_norm": 0.1826171875, "learning_rate": 4.2337002540220155e-05, "loss": 2.2627, "step": 250 }, { "epoch": 0.04, "grad_norm": 0.1826171875, "learning_rate": 4.318374259102456e-05, "loss": 2.2665, "step": 255 }, { "epoch": 0.04, "grad_norm": 0.1904296875, "learning_rate": 4.4030482641828965e-05, "loss": 2.2375, "step": 260 }, { "epoch": 0.04, "grad_norm": 0.1884765625, "learning_rate": 4.487722269263336e-05, "loss": 2.2342, "step": 265 }, { "epoch": 0.05, "grad_norm": 0.189453125, "learning_rate": 4.572396274343776e-05, "loss": 2.2752, "step": 270 }, { "epoch": 0.05, "grad_norm": 0.18359375, "learning_rate": 4.657070279424217e-05, "loss": 2.2579, "step": 275 }, { "epoch": 0.05, "grad_norm": 0.181640625, "learning_rate": 4.741744284504657e-05, "loss": 2.2639, "step": 280 }, { "epoch": 0.05, "grad_norm": 0.1953125, "learning_rate": 4.8264182895850975e-05, "loss": 2.2554, "step": 285 }, { "epoch": 0.05, "grad_norm": 0.185546875, "learning_rate": 4.911092294665538e-05, "loss": 2.2714, "step": 290 }, { "epoch": 0.05, "grad_norm": 0.18359375, "learning_rate": 4.995766299745978e-05, "loss": 2.2491, "step": 295 }, { "epoch": 0.05, "grad_norm": 0.1806640625, "learning_rate": 5.080440304826418e-05, "loss": 2.2583, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.19921875, "learning_rate": 5.165114309906859e-05, "loss": 2.2448, "step": 305 }, { "epoch": 0.05, "grad_norm": 0.19140625, "learning_rate": 5.2497883149873e-05, "loss": 2.2126, "step": 310 }, { "epoch": 0.05, "grad_norm": 0.1845703125, "learning_rate": 5.334462320067739e-05, "loss": 2.2718, "step": 315 }, { "epoch": 0.05, "grad_norm": 0.1943359375, "learning_rate": 5.4191363251481795e-05, "loss": 2.2236, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.1943359375, "learning_rate": 5.5038103302286203e-05, "loss": 2.2657, "step": 325 }, { "epoch": 0.06, "grad_norm": 0.1943359375, "learning_rate": 5.58848433530906e-05, "loss": 2.2426, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.193359375, "learning_rate": 5.673158340389501e-05, "loss": 2.2843, "step": 335 }, { "epoch": 0.06, "grad_norm": 0.193359375, "learning_rate": 5.757832345469941e-05, "loss": 2.2423, "step": 340 }, { "epoch": 0.06, "grad_norm": 0.197265625, "learning_rate": 5.842506350550381e-05, "loss": 2.2573, "step": 345 }, { "epoch": 0.06, "grad_norm": 0.2099609375, "learning_rate": 5.927180355630821e-05, "loss": 2.2623, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.1953125, "learning_rate": 6.011854360711262e-05, "loss": 2.2224, "step": 355 }, { "epoch": 0.06, "grad_norm": 0.2001953125, "learning_rate": 6.096528365791703e-05, "loss": 2.227, "step": 360 }, { "epoch": 0.06, "grad_norm": 0.1982421875, "learning_rate": 6.181202370872143e-05, "loss": 2.2246, "step": 365 }, { "epoch": 0.06, "grad_norm": 0.1962890625, "learning_rate": 6.265876375952583e-05, "loss": 2.2499, "step": 370 }, { "epoch": 0.06, "grad_norm": 0.2001953125, "learning_rate": 6.350550381033024e-05, "loss": 2.2423, "step": 375 }, { "epoch": 0.06, "grad_norm": 0.1962890625, "learning_rate": 6.435224386113464e-05, "loss": 2.2377, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.2060546875, "learning_rate": 6.519898391193903e-05, "loss": 2.2515, "step": 385 }, { "epoch": 0.07, "grad_norm": 0.2021484375, "learning_rate": 6.604572396274344e-05, "loss": 2.2243, "step": 390 }, { "epoch": 0.07, "grad_norm": 0.197265625, "learning_rate": 6.689246401354784e-05, "loss": 2.2758, "step": 395 }, { "epoch": 0.07, "grad_norm": 0.2099609375, "learning_rate": 6.773920406435225e-05, "loss": 2.2412, "step": 400 }, { "epoch": 0.07, "grad_norm": 0.1982421875, "learning_rate": 6.858594411515665e-05, "loss": 2.2574, "step": 405 }, { "epoch": 0.07, "grad_norm": 0.1943359375, "learning_rate": 6.943268416596105e-05, "loss": 2.2604, "step": 410 }, { "epoch": 0.07, "grad_norm": 0.2265625, "learning_rate": 7.027942421676546e-05, "loss": 2.2502, "step": 415 }, { "epoch": 0.07, "grad_norm": 0.1962890625, "learning_rate": 7.112616426756987e-05, "loss": 2.2565, "step": 420 }, { "epoch": 0.07, "grad_norm": 0.197265625, "learning_rate": 7.197290431837426e-05, "loss": 2.2028, "step": 425 }, { "epoch": 0.07, "grad_norm": 0.2001953125, "learning_rate": 7.281964436917867e-05, "loss": 2.2372, "step": 430 }, { "epoch": 0.07, "grad_norm": 0.2080078125, "learning_rate": 7.366638441998307e-05, "loss": 2.2608, "step": 435 }, { "epoch": 0.07, "grad_norm": 0.205078125, "learning_rate": 7.451312447078747e-05, "loss": 2.2387, "step": 440 }, { "epoch": 0.08, "grad_norm": 0.212890625, "learning_rate": 7.535986452159187e-05, "loss": 2.2552, "step": 445 }, { "epoch": 0.08, "grad_norm": 0.205078125, "learning_rate": 7.620660457239628e-05, "loss": 2.183, "step": 450 }, { "epoch": 0.08, "grad_norm": 0.220703125, "learning_rate": 7.705334462320069e-05, "loss": 2.2255, "step": 455 }, { "epoch": 0.08, "grad_norm": 0.2001953125, "learning_rate": 7.790008467400508e-05, "loss": 2.2245, "step": 460 }, { "epoch": 0.08, "grad_norm": 0.2275390625, "learning_rate": 7.874682472480949e-05, "loss": 2.2154, "step": 465 }, { "epoch": 0.08, "grad_norm": 0.203125, "learning_rate": 7.95935647756139e-05, "loss": 2.2057, "step": 470 }, { "epoch": 0.08, "grad_norm": 0.2001953125, "learning_rate": 8.04403048264183e-05, "loss": 2.2309, "step": 475 }, { "epoch": 0.08, "grad_norm": 0.2041015625, "learning_rate": 8.12870448772227e-05, "loss": 2.2176, "step": 480 }, { "epoch": 0.08, "grad_norm": 0.2060546875, "learning_rate": 8.21337849280271e-05, "loss": 2.2527, "step": 485 }, { "epoch": 0.08, "grad_norm": 0.205078125, "learning_rate": 8.298052497883149e-05, "loss": 2.2488, "step": 490 }, { "epoch": 0.08, "grad_norm": 0.21484375, "learning_rate": 8.38272650296359e-05, "loss": 2.2384, "step": 495 }, { "epoch": 0.08, "grad_norm": 0.203125, "learning_rate": 8.467400508044031e-05, "loss": 2.246, "step": 500 }, { "epoch": 0.09, "grad_norm": 0.2060546875, "learning_rate": 8.552074513124472e-05, "loss": 2.2167, "step": 505 }, { "epoch": 0.09, "grad_norm": 0.2109375, "learning_rate": 8.636748518204911e-05, "loss": 2.2183, "step": 510 }, { "epoch": 0.09, "grad_norm": 0.208984375, "learning_rate": 8.721422523285352e-05, "loss": 2.2027, "step": 515 }, { "epoch": 0.09, "grad_norm": 0.251953125, "learning_rate": 8.806096528365793e-05, "loss": 2.1894, "step": 520 }, { "epoch": 0.09, "grad_norm": 0.2080078125, "learning_rate": 8.890770533446233e-05, "loss": 2.2201, "step": 525 }, { "epoch": 0.09, "grad_norm": 0.2138671875, "learning_rate": 8.975444538526672e-05, "loss": 2.2116, "step": 530 }, { "epoch": 0.09, "grad_norm": 0.2041015625, "learning_rate": 9.060118543607113e-05, "loss": 2.2211, "step": 535 }, { "epoch": 0.09, "grad_norm": 0.203125, "learning_rate": 9.144792548687552e-05, "loss": 2.2107, "step": 540 }, { "epoch": 0.09, "grad_norm": 0.2041015625, "learning_rate": 9.229466553767993e-05, "loss": 2.2446, "step": 545 }, { "epoch": 0.09, "grad_norm": 0.2080078125, "learning_rate": 9.314140558848434e-05, "loss": 2.2244, "step": 550 }, { "epoch": 0.09, "grad_norm": 0.208984375, "learning_rate": 9.398814563928874e-05, "loss": 2.2381, "step": 555 }, { "epoch": 0.09, "grad_norm": 0.2099609375, "learning_rate": 9.483488569009315e-05, "loss": 2.2091, "step": 560 }, { "epoch": 0.1, "grad_norm": 0.2236328125, "learning_rate": 9.568162574089755e-05, "loss": 2.2132, "step": 565 }, { "epoch": 0.1, "grad_norm": 0.205078125, "learning_rate": 9.652836579170195e-05, "loss": 2.2323, "step": 570 }, { "epoch": 0.1, "grad_norm": 0.2138671875, "learning_rate": 9.737510584250636e-05, "loss": 2.2082, "step": 575 }, { "epoch": 0.1, "grad_norm": 0.2099609375, "learning_rate": 9.822184589331075e-05, "loss": 2.2041, "step": 580 }, { "epoch": 0.1, "grad_norm": 0.2138671875, "learning_rate": 9.906858594411516e-05, "loss": 2.2193, "step": 585 }, { "epoch": 0.1, "grad_norm": 0.203125, "learning_rate": 9.991532599491956e-05, "loss": 2.2349, "step": 590 }, { "epoch": 0.1, "grad_norm": 0.201171875, "learning_rate": 0.00010076206604572395, "loss": 2.2278, "step": 595 }, { "epoch": 0.1, "grad_norm": 0.2138671875, "learning_rate": 0.00010160880609652836, "loss": 2.2127, "step": 600 }, { "epoch": 0.1, "grad_norm": 0.21875, "learning_rate": 0.00010245554614733277, "loss": 2.2367, "step": 605 }, { "epoch": 0.1, "grad_norm": 0.2119140625, "learning_rate": 0.00010330228619813718, "loss": 2.2172, "step": 610 }, { "epoch": 0.1, "grad_norm": 0.2197265625, "learning_rate": 0.00010414902624894159, "loss": 2.222, "step": 615 }, { "epoch": 0.11, "grad_norm": 0.1982421875, "learning_rate": 0.000104995766299746, "loss": 2.1955, "step": 620 }, { "epoch": 0.11, "grad_norm": 0.203125, "learning_rate": 0.00010584250635055039, "loss": 2.2568, "step": 625 }, { "epoch": 0.11, "grad_norm": 0.21484375, "learning_rate": 0.00010668924640135479, "loss": 2.2051, "step": 630 }, { "epoch": 0.11, "grad_norm": 0.2158203125, "learning_rate": 0.00010753598645215918, "loss": 2.2461, "step": 635 }, { "epoch": 0.11, "grad_norm": 0.208984375, "learning_rate": 0.00010838272650296359, "loss": 2.2322, "step": 640 }, { "epoch": 0.11, "grad_norm": 0.2138671875, "learning_rate": 0.000109229466553768, "loss": 2.2358, "step": 645 }, { "epoch": 0.11, "grad_norm": 0.2001953125, "learning_rate": 0.00011007620660457241, "loss": 2.1646, "step": 650 }, { "epoch": 0.11, "grad_norm": 0.2216796875, "learning_rate": 0.00011092294665537682, "loss": 2.2039, "step": 655 }, { "epoch": 0.11, "grad_norm": 0.2060546875, "learning_rate": 0.0001117696867061812, "loss": 2.1748, "step": 660 }, { "epoch": 0.11, "grad_norm": 0.205078125, "learning_rate": 0.0001126164267569856, "loss": 2.249, "step": 665 }, { "epoch": 0.11, "grad_norm": 0.201171875, "learning_rate": 0.00011346316680779001, "loss": 2.2732, "step": 670 }, { "epoch": 0.11, "grad_norm": 0.2109375, "learning_rate": 0.00011430990685859442, "loss": 2.2496, "step": 675 }, { "epoch": 0.12, "grad_norm": 0.201171875, "learning_rate": 0.00011515664690939882, "loss": 2.2412, "step": 680 }, { "epoch": 0.12, "grad_norm": 0.1982421875, "learning_rate": 0.00011600338696020323, "loss": 2.1998, "step": 685 }, { "epoch": 0.12, "grad_norm": 0.2099609375, "learning_rate": 0.00011685012701100762, "loss": 2.202, "step": 690 }, { "epoch": 0.12, "grad_norm": 0.2001953125, "learning_rate": 0.00011769686706181202, "loss": 2.208, "step": 695 }, { "epoch": 0.12, "grad_norm": 0.2021484375, "learning_rate": 0.00011854360711261643, "loss": 2.212, "step": 700 }, { "epoch": 0.12, "grad_norm": 0.2041015625, "learning_rate": 0.00011939034716342083, "loss": 2.2409, "step": 705 }, { "epoch": 0.12, "grad_norm": 0.19921875, "learning_rate": 0.00012023708721422524, "loss": 2.2069, "step": 710 }, { "epoch": 0.12, "grad_norm": 0.2197265625, "learning_rate": 0.00012108382726502965, "loss": 2.2205, "step": 715 }, { "epoch": 0.12, "grad_norm": 0.1982421875, "learning_rate": 0.00012193056731583406, "loss": 2.2525, "step": 720 }, { "epoch": 0.12, "grad_norm": 0.20703125, "learning_rate": 0.00012277730736663843, "loss": 2.2261, "step": 725 }, { "epoch": 0.12, "grad_norm": 0.1962890625, "learning_rate": 0.00012362404741744285, "loss": 2.1868, "step": 730 }, { "epoch": 0.12, "grad_norm": 0.2041015625, "learning_rate": 0.00012447078746824725, "loss": 2.2363, "step": 735 }, { "epoch": 0.13, "grad_norm": 0.220703125, "learning_rate": 0.00012531752751905167, "loss": 2.1955, "step": 740 }, { "epoch": 0.13, "grad_norm": 0.2021484375, "learning_rate": 0.00012616426756985606, "loss": 2.1964, "step": 745 }, { "epoch": 0.13, "grad_norm": 0.1982421875, "learning_rate": 0.00012701100762066049, "loss": 2.2456, "step": 750 }, { "epoch": 0.13, "grad_norm": 0.2041015625, "learning_rate": 0.00012785774767146485, "loss": 2.1887, "step": 755 }, { "epoch": 0.13, "grad_norm": 0.21484375, "learning_rate": 0.00012870448772226928, "loss": 2.2023, "step": 760 }, { "epoch": 0.13, "grad_norm": 0.197265625, "learning_rate": 0.00012955122777307367, "loss": 2.21, "step": 765 }, { "epoch": 0.13, "grad_norm": 0.20703125, "learning_rate": 0.00013039796782387807, "loss": 2.1852, "step": 770 }, { "epoch": 0.13, "grad_norm": 0.2021484375, "learning_rate": 0.0001312447078746825, "loss": 2.1978, "step": 775 }, { "epoch": 0.13, "grad_norm": 0.197265625, "learning_rate": 0.00013209144792548688, "loss": 2.2205, "step": 780 }, { "epoch": 0.13, "grad_norm": 0.2099609375, "learning_rate": 0.0001329381879762913, "loss": 2.179, "step": 785 }, { "epoch": 0.13, "grad_norm": 0.203125, "learning_rate": 0.00013378492802709567, "loss": 2.2183, "step": 790 }, { "epoch": 0.13, "grad_norm": 0.203125, "learning_rate": 0.0001346316680779001, "loss": 2.1874, "step": 795 }, { "epoch": 0.14, "grad_norm": 0.2021484375, "learning_rate": 0.0001354784081287045, "loss": 2.2123, "step": 800 }, { "epoch": 0.14, "grad_norm": 0.205078125, "learning_rate": 0.0001363251481795089, "loss": 2.2362, "step": 805 }, { "epoch": 0.14, "grad_norm": 0.201171875, "learning_rate": 0.0001371718882303133, "loss": 2.2316, "step": 810 }, { "epoch": 0.14, "grad_norm": 0.1982421875, "learning_rate": 0.0001380186282811177, "loss": 2.1873, "step": 815 }, { "epoch": 0.14, "grad_norm": 0.1982421875, "learning_rate": 0.0001388653683319221, "loss": 2.2417, "step": 820 }, { "epoch": 0.14, "grad_norm": 0.1953125, "learning_rate": 0.0001397121083827265, "loss": 2.1944, "step": 825 }, { "epoch": 0.14, "grad_norm": 0.1982421875, "learning_rate": 0.00014055884843353092, "loss": 2.1983, "step": 830 }, { "epoch": 0.14, "grad_norm": 0.1953125, "learning_rate": 0.0001414055884843353, "loss": 2.2246, "step": 835 }, { "epoch": 0.14, "grad_norm": 0.193359375, "learning_rate": 0.00014225232853513973, "loss": 2.2173, "step": 840 }, { "epoch": 0.14, "grad_norm": 0.201171875, "learning_rate": 0.00014309906858594413, "loss": 2.1354, "step": 845 }, { "epoch": 0.14, "grad_norm": 0.2021484375, "learning_rate": 0.00014394580863674852, "loss": 2.2328, "step": 850 }, { "epoch": 0.14, "grad_norm": 0.1953125, "learning_rate": 0.00014479254868755292, "loss": 2.215, "step": 855 }, { "epoch": 0.15, "grad_norm": 0.2060546875, "learning_rate": 0.00014563928873835734, "loss": 2.214, "step": 860 }, { "epoch": 0.15, "grad_norm": 0.212890625, "learning_rate": 0.00014648602878916173, "loss": 2.1958, "step": 865 }, { "epoch": 0.15, "grad_norm": 0.2041015625, "learning_rate": 0.00014733276883996613, "loss": 2.1809, "step": 870 }, { "epoch": 0.15, "grad_norm": 0.2001953125, "learning_rate": 0.00014817950889077055, "loss": 2.264, "step": 875 }, { "epoch": 0.15, "grad_norm": 0.19921875, "learning_rate": 0.00014902624894157495, "loss": 2.2147, "step": 880 }, { "epoch": 0.15, "grad_norm": 0.193359375, "learning_rate": 0.00014987298899237934, "loss": 2.1813, "step": 885 }, { "epoch": 0.15, "grad_norm": 0.2001953125, "learning_rate": 0.00015071972904318374, "loss": 2.1541, "step": 890 }, { "epoch": 0.15, "grad_norm": 0.205078125, "learning_rate": 0.00015156646909398816, "loss": 2.1661, "step": 895 }, { "epoch": 0.15, "grad_norm": 0.2060546875, "learning_rate": 0.00015241320914479255, "loss": 2.2126, "step": 900 }, { "epoch": 0.15, "grad_norm": 0.19921875, "learning_rate": 0.00015325994919559695, "loss": 2.192, "step": 905 }, { "epoch": 0.15, "grad_norm": 0.1982421875, "learning_rate": 0.00015410668924640137, "loss": 2.2546, "step": 910 }, { "epoch": 0.15, "grad_norm": 0.197265625, "learning_rate": 0.00015495342929720577, "loss": 2.1839, "step": 915 }, { "epoch": 0.16, "grad_norm": 0.2041015625, "learning_rate": 0.00015580016934801016, "loss": 2.2013, "step": 920 }, { "epoch": 0.16, "grad_norm": 0.19140625, "learning_rate": 0.00015664690939881456, "loss": 2.2374, "step": 925 }, { "epoch": 0.16, "grad_norm": 0.19921875, "learning_rate": 0.00015749364944961898, "loss": 2.1984, "step": 930 }, { "epoch": 0.16, "grad_norm": 0.19921875, "learning_rate": 0.00015834038950042337, "loss": 2.171, "step": 935 }, { "epoch": 0.16, "grad_norm": 0.19140625, "learning_rate": 0.0001591871295512278, "loss": 2.199, "step": 940 }, { "epoch": 0.16, "grad_norm": 0.2021484375, "learning_rate": 0.0001600338696020322, "loss": 2.2011, "step": 945 }, { "epoch": 0.16, "grad_norm": 0.2109375, "learning_rate": 0.0001608806096528366, "loss": 2.2099, "step": 950 }, { "epoch": 0.16, "grad_norm": 0.1982421875, "learning_rate": 0.00016172734970364098, "loss": 2.203, "step": 955 }, { "epoch": 0.16, "grad_norm": 0.197265625, "learning_rate": 0.0001625740897544454, "loss": 2.2233, "step": 960 }, { "epoch": 0.16, "grad_norm": 0.19921875, "learning_rate": 0.0001634208298052498, "loss": 2.2154, "step": 965 }, { "epoch": 0.16, "grad_norm": 0.197265625, "learning_rate": 0.0001642675698560542, "loss": 2.1916, "step": 970 }, { "epoch": 0.17, "grad_norm": 0.193359375, "learning_rate": 0.00016511430990685862, "loss": 2.2565, "step": 975 }, { "epoch": 0.17, "grad_norm": 0.1875, "learning_rate": 0.00016596104995766298, "loss": 2.2449, "step": 980 }, { "epoch": 0.17, "grad_norm": 0.19140625, "learning_rate": 0.0001668077900084674, "loss": 2.2073, "step": 985 }, { "epoch": 0.17, "grad_norm": 0.1884765625, "learning_rate": 0.0001676545300592718, "loss": 2.2011, "step": 990 }, { "epoch": 0.17, "grad_norm": 0.193359375, "learning_rate": 0.00016850127011007622, "loss": 2.2025, "step": 995 }, { "epoch": 0.17, "grad_norm": 0.1865234375, "learning_rate": 0.00016934801016088062, "loss": 2.2179, "step": 1000 }, { "epoch": 0.17, "grad_norm": 0.197265625, "learning_rate": 0.00017019475021168501, "loss": 2.2083, "step": 1005 }, { "epoch": 0.17, "grad_norm": 0.1865234375, "learning_rate": 0.00017104149026248944, "loss": 2.1914, "step": 1010 }, { "epoch": 0.17, "grad_norm": 0.1962890625, "learning_rate": 0.00017188823031329383, "loss": 2.2375, "step": 1015 }, { "epoch": 0.17, "grad_norm": 0.193359375, "learning_rate": 0.00017273497036409823, "loss": 2.2289, "step": 1020 }, { "epoch": 0.17, "grad_norm": 0.1904296875, "learning_rate": 0.00017358171041490262, "loss": 2.2088, "step": 1025 }, { "epoch": 0.17, "grad_norm": 0.193359375, "learning_rate": 0.00017442845046570704, "loss": 2.1675, "step": 1030 }, { "epoch": 0.18, "grad_norm": 0.2001953125, "learning_rate": 0.00017527519051651144, "loss": 2.2139, "step": 1035 }, { "epoch": 0.18, "grad_norm": 0.1845703125, "learning_rate": 0.00017612193056731586, "loss": 2.1872, "step": 1040 }, { "epoch": 0.18, "grad_norm": 0.189453125, "learning_rate": 0.00017696867061812023, "loss": 2.1982, "step": 1045 }, { "epoch": 0.18, "grad_norm": 0.1953125, "learning_rate": 0.00017781541066892465, "loss": 2.2075, "step": 1050 }, { "epoch": 0.18, "grad_norm": 0.19921875, "learning_rate": 0.00017866215071972905, "loss": 2.2447, "step": 1055 }, { "epoch": 0.18, "grad_norm": 0.193359375, "learning_rate": 0.00017950889077053344, "loss": 2.1948, "step": 1060 }, { "epoch": 0.18, "grad_norm": 0.205078125, "learning_rate": 0.00018035563082133786, "loss": 2.2087, "step": 1065 }, { "epoch": 0.18, "grad_norm": 0.20703125, "learning_rate": 0.00018120237087214226, "loss": 2.2168, "step": 1070 }, { "epoch": 0.18, "grad_norm": 0.2080078125, "learning_rate": 0.00018204911092294668, "loss": 2.2122, "step": 1075 }, { "epoch": 0.18, "grad_norm": 0.189453125, "learning_rate": 0.00018289585097375105, "loss": 2.2347, "step": 1080 }, { "epoch": 0.18, "grad_norm": 0.1962890625, "learning_rate": 0.00018374259102455547, "loss": 2.2307, "step": 1085 }, { "epoch": 0.18, "grad_norm": 0.18359375, "learning_rate": 0.00018458933107535987, "loss": 2.1836, "step": 1090 }, { "epoch": 0.19, "grad_norm": 0.1953125, "learning_rate": 0.0001854360711261643, "loss": 2.1929, "step": 1095 }, { "epoch": 0.19, "grad_norm": 0.189453125, "learning_rate": 0.00018628281117696868, "loss": 2.1908, "step": 1100 }, { "epoch": 0.19, "grad_norm": 0.18359375, "learning_rate": 0.00018712955122777308, "loss": 2.2123, "step": 1105 }, { "epoch": 0.19, "grad_norm": 0.1875, "learning_rate": 0.00018797629127857747, "loss": 2.1282, "step": 1110 }, { "epoch": 0.19, "grad_norm": 0.1953125, "learning_rate": 0.0001888230313293819, "loss": 2.1387, "step": 1115 }, { "epoch": 0.19, "grad_norm": 0.193359375, "learning_rate": 0.0001896697713801863, "loss": 2.2153, "step": 1120 }, { "epoch": 0.19, "grad_norm": 0.1875, "learning_rate": 0.0001905165114309907, "loss": 2.1932, "step": 1125 }, { "epoch": 0.19, "grad_norm": 0.1923828125, "learning_rate": 0.0001913632514817951, "loss": 2.1695, "step": 1130 }, { "epoch": 0.19, "grad_norm": 0.1865234375, "learning_rate": 0.0001922099915325995, "loss": 2.1855, "step": 1135 }, { "epoch": 0.19, "grad_norm": 0.1962890625, "learning_rate": 0.0001930567315834039, "loss": 2.2053, "step": 1140 }, { "epoch": 0.19, "grad_norm": 0.2021484375, "learning_rate": 0.0001939034716342083, "loss": 2.2082, "step": 1145 }, { "epoch": 0.19, "grad_norm": 0.1962890625, "learning_rate": 0.00019475021168501272, "loss": 2.1713, "step": 1150 }, { "epoch": 0.2, "grad_norm": 0.1865234375, "learning_rate": 0.0001955969517358171, "loss": 2.1904, "step": 1155 }, { "epoch": 0.2, "grad_norm": 0.1953125, "learning_rate": 0.0001964436917866215, "loss": 2.235, "step": 1160 }, { "epoch": 0.2, "grad_norm": 0.1982421875, "learning_rate": 0.00019729043183742593, "loss": 2.1879, "step": 1165 }, { "epoch": 0.2, "grad_norm": 0.1884765625, "learning_rate": 0.00019813717188823032, "loss": 2.2022, "step": 1170 }, { "epoch": 0.2, "grad_norm": 0.1953125, "learning_rate": 0.00019898391193903472, "loss": 2.1715, "step": 1175 }, { "epoch": 0.2, "grad_norm": 0.189453125, "learning_rate": 0.00019983065198983911, "loss": 2.2363, "step": 1180 }, { "epoch": 0.2, "grad_norm": 0.197265625, "learning_rate": 0.00019999993008532863, "loss": 2.1869, "step": 1185 }, { "epoch": 0.2, "grad_norm": 0.2001953125, "learning_rate": 0.00019999964605714373, "loss": 2.1716, "step": 1190 }, { "epoch": 0.2, "grad_norm": 0.1923828125, "learning_rate": 0.00019999914354639845, "loss": 2.1728, "step": 1195 }, { "epoch": 0.2, "grad_norm": 0.1845703125, "learning_rate": 0.00019999842255419064, "loss": 2.1702, "step": 1200 }, { "epoch": 0.2, "grad_norm": 0.1845703125, "learning_rate": 0.0001999974830820956, "loss": 2.1999, "step": 1205 }, { "epoch": 0.2, "grad_norm": 0.193359375, "learning_rate": 0.00019999632513216587, "loss": 2.2017, "step": 1210 }, { "epoch": 0.21, "grad_norm": 0.1865234375, "learning_rate": 0.00019999494870693142, "loss": 2.1771, "step": 1215 }, { "epoch": 0.21, "grad_norm": 0.1875, "learning_rate": 0.00019999335380939948, "loss": 2.2625, "step": 1220 }, { "epoch": 0.21, "grad_norm": 0.1875, "learning_rate": 0.00019999154044305465, "loss": 2.1442, "step": 1225 }, { "epoch": 0.21, "grad_norm": 0.1875, "learning_rate": 0.00019998950861185885, "loss": 2.1751, "step": 1230 }, { "epoch": 0.21, "grad_norm": 0.185546875, "learning_rate": 0.00019998725832025125, "loss": 2.1923, "step": 1235 }, { "epoch": 0.21, "grad_norm": 0.1904296875, "learning_rate": 0.0001999847895731484, "loss": 2.1393, "step": 1240 }, { "epoch": 0.21, "grad_norm": 0.1875, "learning_rate": 0.0001999821023759441, "loss": 2.1904, "step": 1245 }, { "epoch": 0.21, "grad_norm": 0.1865234375, "learning_rate": 0.00019997919673450938, "loss": 2.1866, "step": 1250 }, { "epoch": 0.21, "grad_norm": 0.19140625, "learning_rate": 0.00019997607265519264, "loss": 2.1876, "step": 1255 }, { "epoch": 0.21, "grad_norm": 0.1923828125, "learning_rate": 0.00019997273014481942, "loss": 2.1773, "step": 1260 }, { "epoch": 0.21, "grad_norm": 0.185546875, "learning_rate": 0.0001999691692106926, "loss": 2.2087, "step": 1265 }, { "epoch": 0.22, "grad_norm": 0.1826171875, "learning_rate": 0.00019996538986059221, "loss": 2.1703, "step": 1270 }, { "epoch": 0.22, "grad_norm": 0.1904296875, "learning_rate": 0.0001999613921027755, "loss": 2.1993, "step": 1275 }, { "epoch": 0.22, "grad_norm": 0.189453125, "learning_rate": 0.0001999571759459769, "loss": 2.1809, "step": 1280 }, { "epoch": 0.22, "grad_norm": 0.1875, "learning_rate": 0.000199952741399408, "loss": 2.1823, "step": 1285 }, { "epoch": 0.22, "grad_norm": 0.18359375, "learning_rate": 0.00019994808847275755, "loss": 2.1903, "step": 1290 }, { "epoch": 0.22, "grad_norm": 0.185546875, "learning_rate": 0.00019994321717619143, "loss": 2.1749, "step": 1295 }, { "epoch": 0.22, "grad_norm": 0.1845703125, "learning_rate": 0.0001999381275203526, "loss": 2.1858, "step": 1300 }, { "epoch": 0.22, "grad_norm": 0.185546875, "learning_rate": 0.00019993281951636113, "loss": 2.1789, "step": 1305 }, { "epoch": 0.22, "grad_norm": 0.1943359375, "learning_rate": 0.00019992729317581408, "loss": 2.1869, "step": 1310 }, { "epoch": 0.22, "grad_norm": 0.1943359375, "learning_rate": 0.00019992154851078563, "loss": 2.2076, "step": 1315 }, { "epoch": 0.22, "grad_norm": 0.1865234375, "learning_rate": 0.0001999155855338269, "loss": 2.2026, "step": 1320 }, { "epoch": 0.22, "grad_norm": 0.193359375, "learning_rate": 0.00019990940425796604, "loss": 2.1573, "step": 1325 }, { "epoch": 0.23, "grad_norm": 0.1865234375, "learning_rate": 0.000199903004696708, "loss": 2.1708, "step": 1330 }, { "epoch": 0.23, "grad_norm": 0.1865234375, "learning_rate": 0.00019989638686403484, "loss": 2.2057, "step": 1335 }, { "epoch": 0.23, "grad_norm": 0.1884765625, "learning_rate": 0.0001998895507744054, "loss": 2.201, "step": 1340 }, { "epoch": 0.23, "grad_norm": 0.1884765625, "learning_rate": 0.00019988249644275542, "loss": 2.2209, "step": 1345 }, { "epoch": 0.23, "grad_norm": 0.1796875, "learning_rate": 0.0001998752238844974, "loss": 2.2228, "step": 1350 }, { "epoch": 0.23, "grad_norm": 0.189453125, "learning_rate": 0.00019986773311552069, "loss": 2.1913, "step": 1355 }, { "epoch": 0.23, "grad_norm": 0.1923828125, "learning_rate": 0.00019986002415219137, "loss": 2.1614, "step": 1360 }, { "epoch": 0.23, "grad_norm": 0.1875, "learning_rate": 0.00019985209701135222, "loss": 2.1918, "step": 1365 }, { "epoch": 0.23, "grad_norm": 0.19140625, "learning_rate": 0.00019984395171032278, "loss": 2.1789, "step": 1370 }, { "epoch": 0.23, "grad_norm": 0.193359375, "learning_rate": 0.0001998355882668991, "loss": 2.1664, "step": 1375 }, { "epoch": 0.23, "grad_norm": 0.19921875, "learning_rate": 0.00019982700669935396, "loss": 2.2059, "step": 1380 }, { "epoch": 0.23, "grad_norm": 0.185546875, "learning_rate": 0.00019981820702643662, "loss": 2.1638, "step": 1385 }, { "epoch": 0.24, "grad_norm": 0.189453125, "learning_rate": 0.00019980918926737294, "loss": 2.2125, "step": 1390 }, { "epoch": 0.24, "grad_norm": 0.1904296875, "learning_rate": 0.0001997999534418652, "loss": 2.1583, "step": 1395 }, { "epoch": 0.24, "grad_norm": 0.1806640625, "learning_rate": 0.00019979049957009212, "loss": 2.1899, "step": 1400 }, { "epoch": 0.24, "grad_norm": 0.1845703125, "learning_rate": 0.00019978082767270884, "loss": 2.2027, "step": 1405 }, { "epoch": 0.24, "grad_norm": 0.18359375, "learning_rate": 0.0001997709377708469, "loss": 2.2096, "step": 1410 }, { "epoch": 0.24, "grad_norm": 0.1845703125, "learning_rate": 0.000199760829886114, "loss": 2.1597, "step": 1415 }, { "epoch": 0.24, "grad_norm": 0.1953125, "learning_rate": 0.00019975050404059426, "loss": 2.1986, "step": 1420 }, { "epoch": 0.24, "grad_norm": 0.193359375, "learning_rate": 0.00019973996025684788, "loss": 2.2003, "step": 1425 }, { "epoch": 0.24, "grad_norm": 0.189453125, "learning_rate": 0.00019972919855791132, "loss": 2.1415, "step": 1430 }, { "epoch": 0.24, "grad_norm": 0.1865234375, "learning_rate": 0.00019971821896729703, "loss": 2.1862, "step": 1435 }, { "epoch": 0.24, "grad_norm": 0.1875, "learning_rate": 0.00019970702150899365, "loss": 2.1944, "step": 1440 }, { "epoch": 0.24, "grad_norm": 0.193359375, "learning_rate": 0.00019969560620746571, "loss": 2.2099, "step": 1445 }, { "epoch": 0.25, "grad_norm": 0.1865234375, "learning_rate": 0.00019968397308765375, "loss": 2.2194, "step": 1450 }, { "epoch": 0.25, "grad_norm": 0.1875, "learning_rate": 0.00019967212217497426, "loss": 2.2112, "step": 1455 }, { "epoch": 0.25, "grad_norm": 0.1884765625, "learning_rate": 0.00019966005349531942, "loss": 2.1745, "step": 1460 }, { "epoch": 0.25, "grad_norm": 0.1904296875, "learning_rate": 0.00019964776707505734, "loss": 2.1624, "step": 1465 }, { "epoch": 0.25, "grad_norm": 0.1904296875, "learning_rate": 0.0001996352629410318, "loss": 2.1977, "step": 1470 }, { "epoch": 0.25, "grad_norm": 0.181640625, "learning_rate": 0.00019962254112056223, "loss": 2.2192, "step": 1475 }, { "epoch": 0.25, "grad_norm": 0.189453125, "learning_rate": 0.00019960960164144368, "loss": 2.1652, "step": 1480 }, { "epoch": 0.25, "grad_norm": 0.1982421875, "learning_rate": 0.00019959644453194678, "loss": 2.1841, "step": 1485 }, { "epoch": 0.25, "grad_norm": 0.19140625, "learning_rate": 0.00019958306982081761, "loss": 2.2137, "step": 1490 }, { "epoch": 0.25, "grad_norm": 0.18359375, "learning_rate": 0.00019956947753727765, "loss": 2.1878, "step": 1495 }, { "epoch": 0.25, "grad_norm": 0.1884765625, "learning_rate": 0.00019955566771102384, "loss": 2.148, "step": 1500 }, { "epoch": 0.25, "grad_norm": 0.1923828125, "learning_rate": 0.0001995416403722283, "loss": 2.2543, "step": 1505 }, { "epoch": 0.26, "grad_norm": 0.1865234375, "learning_rate": 0.00019952739555153848, "loss": 2.1969, "step": 1510 }, { "epoch": 0.26, "grad_norm": 0.1962890625, "learning_rate": 0.0001995129332800769, "loss": 2.2019, "step": 1515 }, { "epoch": 0.26, "grad_norm": 0.1943359375, "learning_rate": 0.00019949825358944113, "loss": 2.1805, "step": 1520 }, { "epoch": 0.26, "grad_norm": 0.19140625, "learning_rate": 0.00019948335651170403, "loss": 2.1349, "step": 1525 }, { "epoch": 0.26, "grad_norm": 0.193359375, "learning_rate": 0.00019946824207941308, "loss": 2.1884, "step": 1530 }, { "epoch": 0.26, "grad_norm": 0.19140625, "learning_rate": 0.00019945291032559087, "loss": 2.1758, "step": 1535 }, { "epoch": 0.26, "grad_norm": 0.1904296875, "learning_rate": 0.0001994373612837347, "loss": 2.2044, "step": 1540 }, { "epoch": 0.26, "grad_norm": 0.18359375, "learning_rate": 0.00019942159498781667, "loss": 2.1701, "step": 1545 }, { "epoch": 0.26, "grad_norm": 0.1923828125, "learning_rate": 0.00019940561147228347, "loss": 2.1771, "step": 1550 }, { "epoch": 0.26, "grad_norm": 0.1962890625, "learning_rate": 0.0001993894107720564, "loss": 2.1836, "step": 1555 }, { "epoch": 0.26, "grad_norm": 0.1865234375, "learning_rate": 0.00019937299292253137, "loss": 2.1649, "step": 1560 }, { "epoch": 0.27, "grad_norm": 0.1884765625, "learning_rate": 0.00019935635795957857, "loss": 2.1816, "step": 1565 }, { "epoch": 0.27, "grad_norm": 0.193359375, "learning_rate": 0.00019933950591954265, "loss": 2.189, "step": 1570 }, { "epoch": 0.27, "grad_norm": 0.1943359375, "learning_rate": 0.0001993224368392425, "loss": 2.2155, "step": 1575 }, { "epoch": 0.27, "grad_norm": 0.1845703125, "learning_rate": 0.00019930515075597123, "loss": 2.1719, "step": 1580 }, { "epoch": 0.27, "grad_norm": 0.185546875, "learning_rate": 0.00019928764770749604, "loss": 2.1808, "step": 1585 }, { "epoch": 0.27, "grad_norm": 0.1904296875, "learning_rate": 0.00019926992773205816, "loss": 2.1824, "step": 1590 }, { "epoch": 0.27, "grad_norm": 0.1845703125, "learning_rate": 0.00019925199086837282, "loss": 2.1842, "step": 1595 }, { "epoch": 0.27, "grad_norm": 0.1923828125, "learning_rate": 0.00019923383715562902, "loss": 2.1908, "step": 1600 }, { "epoch": 0.27, "grad_norm": 0.18359375, "learning_rate": 0.00019921546663348964, "loss": 2.2098, "step": 1605 }, { "epoch": 0.27, "grad_norm": 0.19140625, "learning_rate": 0.00019919687934209123, "loss": 2.1821, "step": 1610 }, { "epoch": 0.27, "grad_norm": 0.1904296875, "learning_rate": 0.0001991780753220439, "loss": 2.1931, "step": 1615 }, { "epoch": 0.27, "grad_norm": 0.2001953125, "learning_rate": 0.00019915905461443125, "loss": 2.2284, "step": 1620 }, { "epoch": 0.28, "grad_norm": 0.1943359375, "learning_rate": 0.00019913981726081046, "loss": 2.1604, "step": 1625 }, { "epoch": 0.28, "grad_norm": 0.1826171875, "learning_rate": 0.00019912036330321185, "loss": 2.2391, "step": 1630 }, { "epoch": 0.28, "grad_norm": 0.185546875, "learning_rate": 0.0001991006927841391, "loss": 2.1986, "step": 1635 }, { "epoch": 0.28, "grad_norm": 0.19140625, "learning_rate": 0.00019908080574656905, "loss": 2.2385, "step": 1640 }, { "epoch": 0.28, "grad_norm": 0.1845703125, "learning_rate": 0.00019906070223395153, "loss": 2.1974, "step": 1645 }, { "epoch": 0.28, "grad_norm": 0.1884765625, "learning_rate": 0.00019904038229020935, "loss": 2.1889, "step": 1650 }, { "epoch": 0.28, "grad_norm": 0.189453125, "learning_rate": 0.00019901984595973823, "loss": 2.1733, "step": 1655 }, { "epoch": 0.28, "grad_norm": 0.1884765625, "learning_rate": 0.00019899909328740666, "loss": 2.1783, "step": 1660 }, { "epoch": 0.28, "grad_norm": 0.189453125, "learning_rate": 0.00019897812431855569, "loss": 2.1863, "step": 1665 }, { "epoch": 0.28, "grad_norm": 0.197265625, "learning_rate": 0.00019895693909899908, "loss": 2.1418, "step": 1670 }, { "epoch": 0.28, "grad_norm": 0.1884765625, "learning_rate": 0.00019893553767502299, "loss": 2.1798, "step": 1675 }, { "epoch": 0.28, "grad_norm": 0.1884765625, "learning_rate": 0.00019891392009338597, "loss": 2.2185, "step": 1680 }, { "epoch": 0.29, "grad_norm": 0.185546875, "learning_rate": 0.0001988920864013188, "loss": 2.2217, "step": 1685 }, { "epoch": 0.29, "grad_norm": 0.189453125, "learning_rate": 0.00019887003664652452, "loss": 2.19, "step": 1690 }, { "epoch": 0.29, "grad_norm": 0.19140625, "learning_rate": 0.0001988477708771781, "loss": 2.2109, "step": 1695 }, { "epoch": 0.29, "grad_norm": 0.19921875, "learning_rate": 0.00019882528914192657, "loss": 2.1982, "step": 1700 }, { "epoch": 0.29, "grad_norm": 0.1875, "learning_rate": 0.0001988025914898888, "loss": 2.1367, "step": 1705 }, { "epoch": 0.29, "grad_norm": 0.1904296875, "learning_rate": 0.0001987796779706553, "loss": 2.1894, "step": 1710 }, { "epoch": 0.29, "grad_norm": 0.1962890625, "learning_rate": 0.00019875654863428838, "loss": 2.1371, "step": 1715 }, { "epoch": 0.29, "grad_norm": 0.197265625, "learning_rate": 0.00019873320353132174, "loss": 2.1592, "step": 1720 }, { "epoch": 0.29, "grad_norm": 0.19921875, "learning_rate": 0.00019870964271276055, "loss": 2.1695, "step": 1725 }, { "epoch": 0.29, "grad_norm": 0.18359375, "learning_rate": 0.00019868586623008125, "loss": 2.1658, "step": 1730 }, { "epoch": 0.29, "grad_norm": 0.1884765625, "learning_rate": 0.00019866187413523153, "loss": 2.1584, "step": 1735 }, { "epoch": 0.29, "grad_norm": 0.1923828125, "learning_rate": 0.00019863766648063006, "loss": 2.2071, "step": 1740 }, { "epoch": 0.3, "grad_norm": 0.19140625, "learning_rate": 0.00019861324331916662, "loss": 2.2012, "step": 1745 }, { "epoch": 0.3, "grad_norm": 0.193359375, "learning_rate": 0.00019858860470420167, "loss": 2.2062, "step": 1750 }, { "epoch": 0.3, "grad_norm": 0.1845703125, "learning_rate": 0.00019856375068956651, "loss": 2.1877, "step": 1755 }, { "epoch": 0.3, "grad_norm": 0.1923828125, "learning_rate": 0.000198538681329563, "loss": 2.1791, "step": 1760 }, { "epoch": 0.3, "grad_norm": 0.2021484375, "learning_rate": 0.00019851339667896354, "loss": 2.155, "step": 1765 }, { "epoch": 0.3, "grad_norm": 0.193359375, "learning_rate": 0.00019848789679301085, "loss": 2.1589, "step": 1770 }, { "epoch": 0.3, "grad_norm": 0.1904296875, "learning_rate": 0.00019846218172741794, "loss": 2.1752, "step": 1775 }, { "epoch": 0.3, "grad_norm": 0.185546875, "learning_rate": 0.00019843625153836798, "loss": 2.2145, "step": 1780 }, { "epoch": 0.3, "grad_norm": 0.1826171875, "learning_rate": 0.00019841010628251406, "loss": 2.166, "step": 1785 }, { "epoch": 0.3, "grad_norm": 0.1875, "learning_rate": 0.00019838374601697923, "loss": 2.2264, "step": 1790 }, { "epoch": 0.3, "grad_norm": 0.1943359375, "learning_rate": 0.00019835717079935624, "loss": 2.1749, "step": 1795 }, { "epoch": 0.3, "grad_norm": 0.1923828125, "learning_rate": 0.00019833038068770757, "loss": 2.1778, "step": 1800 }, { "epoch": 0.31, "grad_norm": 0.193359375, "learning_rate": 0.00019830337574056514, "loss": 2.1967, "step": 1805 }, { "epoch": 0.31, "grad_norm": 0.19140625, "learning_rate": 0.00019827615601693022, "loss": 2.1804, "step": 1810 }, { "epoch": 0.31, "grad_norm": 0.1884765625, "learning_rate": 0.00019824872157627339, "loss": 2.2043, "step": 1815 }, { "epoch": 0.31, "grad_norm": 0.1953125, "learning_rate": 0.00019822107247853435, "loss": 2.1591, "step": 1820 }, { "epoch": 0.31, "grad_norm": 0.193359375, "learning_rate": 0.00019819320878412174, "loss": 2.1763, "step": 1825 }, { "epoch": 0.31, "grad_norm": 0.1865234375, "learning_rate": 0.00019816513055391307, "loss": 2.1789, "step": 1830 }, { "epoch": 0.31, "grad_norm": 0.189453125, "learning_rate": 0.00019813683784925467, "loss": 2.2, "step": 1835 }, { "epoch": 0.31, "grad_norm": 0.1904296875, "learning_rate": 0.00019810833073196133, "loss": 2.1581, "step": 1840 }, { "epoch": 0.31, "grad_norm": 0.1904296875, "learning_rate": 0.00019807960926431634, "loss": 2.2085, "step": 1845 }, { "epoch": 0.31, "grad_norm": 0.1875, "learning_rate": 0.00019805067350907134, "loss": 2.1584, "step": 1850 }, { "epoch": 0.31, "grad_norm": 0.1943359375, "learning_rate": 0.00019802152352944616, "loss": 2.2049, "step": 1855 }, { "epoch": 0.32, "grad_norm": 0.1923828125, "learning_rate": 0.0001979921593891286, "loss": 2.1572, "step": 1860 }, { "epoch": 0.32, "grad_norm": 0.197265625, "learning_rate": 0.00019796258115227443, "loss": 2.2329, "step": 1865 }, { "epoch": 0.32, "grad_norm": 0.189453125, "learning_rate": 0.00019793278888350716, "loss": 2.1925, "step": 1870 }, { "epoch": 0.32, "grad_norm": 0.1943359375, "learning_rate": 0.00019790278264791795, "loss": 2.1534, "step": 1875 }, { "epoch": 0.32, "grad_norm": 0.1943359375, "learning_rate": 0.00019787256251106543, "loss": 2.1437, "step": 1880 }, { "epoch": 0.32, "grad_norm": 0.19140625, "learning_rate": 0.00019784212853897552, "loss": 2.193, "step": 1885 }, { "epoch": 0.32, "grad_norm": 0.1923828125, "learning_rate": 0.0001978114807981414, "loss": 2.1838, "step": 1890 }, { "epoch": 0.32, "grad_norm": 0.1884765625, "learning_rate": 0.0001977806193555233, "loss": 2.1925, "step": 1895 }, { "epoch": 0.32, "grad_norm": 0.189453125, "learning_rate": 0.00019774954427854833, "loss": 2.1709, "step": 1900 }, { "epoch": 0.32, "grad_norm": 0.1875, "learning_rate": 0.0001977182556351103, "loss": 2.1448, "step": 1905 }, { "epoch": 0.32, "grad_norm": 0.19921875, "learning_rate": 0.0001976867534935697, "loss": 2.2003, "step": 1910 }, { "epoch": 0.32, "grad_norm": 0.2060546875, "learning_rate": 0.00019765503792275354, "loss": 2.1616, "step": 1915 }, { "epoch": 0.33, "grad_norm": 0.2001953125, "learning_rate": 0.0001976231089919549, "loss": 2.171, "step": 1920 }, { "epoch": 0.33, "grad_norm": 0.19140625, "learning_rate": 0.00019759096677093334, "loss": 2.1726, "step": 1925 }, { "epoch": 0.33, "grad_norm": 0.189453125, "learning_rate": 0.00019755861132991412, "loss": 2.1745, "step": 1930 }, { "epoch": 0.33, "grad_norm": 0.1953125, "learning_rate": 0.0001975260427395886, "loss": 2.1956, "step": 1935 }, { "epoch": 0.33, "grad_norm": 0.19140625, "learning_rate": 0.00019749326107111362, "loss": 2.2004, "step": 1940 }, { "epoch": 0.33, "grad_norm": 0.1904296875, "learning_rate": 0.00019746026639611174, "loss": 2.1805, "step": 1945 }, { "epoch": 0.33, "grad_norm": 0.1923828125, "learning_rate": 0.00019742705878667075, "loss": 2.2056, "step": 1950 }, { "epoch": 0.33, "grad_norm": 0.1962890625, "learning_rate": 0.0001973936383153438, "loss": 2.1754, "step": 1955 }, { "epoch": 0.33, "grad_norm": 0.1875, "learning_rate": 0.00019736000505514908, "loss": 2.1286, "step": 1960 }, { "epoch": 0.33, "grad_norm": 0.1962890625, "learning_rate": 0.0001973261590795696, "loss": 2.1644, "step": 1965 }, { "epoch": 0.33, "grad_norm": 0.1923828125, "learning_rate": 0.00019729210046255316, "loss": 2.2054, "step": 1970 }, { "epoch": 0.33, "grad_norm": 0.1943359375, "learning_rate": 0.0001972578292785122, "loss": 2.2077, "step": 1975 }, { "epoch": 0.34, "grad_norm": 0.1884765625, "learning_rate": 0.00019722334560232354, "loss": 2.1545, "step": 1980 }, { "epoch": 0.34, "grad_norm": 0.1845703125, "learning_rate": 0.00019718864950932826, "loss": 2.1974, "step": 1985 }, { "epoch": 0.34, "grad_norm": 0.1884765625, "learning_rate": 0.00019715374107533157, "loss": 2.1435, "step": 1990 }, { "epoch": 0.34, "grad_norm": 0.1904296875, "learning_rate": 0.00019711862037660253, "loss": 2.195, "step": 1995 }, { "epoch": 0.34, "grad_norm": 0.1884765625, "learning_rate": 0.00019708328748987403, "loss": 2.2048, "step": 2000 }, { "epoch": 0.34, "grad_norm": 0.1845703125, "learning_rate": 0.00019704774249234256, "loss": 2.2101, "step": 2005 }, { "epoch": 0.34, "grad_norm": 0.1962890625, "learning_rate": 0.00019701198546166803, "loss": 2.2184, "step": 2010 }, { "epoch": 0.34, "grad_norm": 0.19140625, "learning_rate": 0.0001969760164759735, "loss": 2.1553, "step": 2015 }, { "epoch": 0.34, "grad_norm": 0.1953125, "learning_rate": 0.0001969398356138453, "loss": 2.1782, "step": 2020 }, { "epoch": 0.34, "grad_norm": 0.1923828125, "learning_rate": 0.00019690344295433256, "loss": 2.1714, "step": 2025 }, { "epoch": 0.34, "grad_norm": 0.1904296875, "learning_rate": 0.00019686683857694716, "loss": 2.1662, "step": 2030 }, { "epoch": 0.34, "grad_norm": 0.193359375, "learning_rate": 0.0001968300225616636, "loss": 2.1654, "step": 2035 }, { "epoch": 0.35, "grad_norm": 0.1904296875, "learning_rate": 0.00019679299498891873, "loss": 2.2053, "step": 2040 }, { "epoch": 0.35, "grad_norm": 0.189453125, "learning_rate": 0.00019675575593961156, "loss": 2.1423, "step": 2045 }, { "epoch": 0.35, "grad_norm": 0.205078125, "learning_rate": 0.0001967183054951033, "loss": 2.1537, "step": 2050 }, { "epoch": 0.35, "grad_norm": 0.19140625, "learning_rate": 0.00019668064373721685, "loss": 2.2083, "step": 2055 }, { "epoch": 0.35, "grad_norm": 0.1875, "learning_rate": 0.00019664277074823693, "loss": 2.164, "step": 2060 }, { "epoch": 0.35, "grad_norm": 0.189453125, "learning_rate": 0.0001966046866109097, "loss": 2.1678, "step": 2065 }, { "epoch": 0.35, "grad_norm": 0.19140625, "learning_rate": 0.00019656639140844262, "loss": 2.2032, "step": 2070 }, { "epoch": 0.35, "grad_norm": 0.193359375, "learning_rate": 0.00019652788522450437, "loss": 2.2068, "step": 2075 }, { "epoch": 0.35, "grad_norm": 0.1875, "learning_rate": 0.00019648916814322446, "loss": 2.1622, "step": 2080 }, { "epoch": 0.35, "grad_norm": 0.19140625, "learning_rate": 0.00019645024024919337, "loss": 2.2037, "step": 2085 }, { "epoch": 0.35, "grad_norm": 0.1826171875, "learning_rate": 0.00019641110162746202, "loss": 2.1631, "step": 2090 }, { "epoch": 0.35, "grad_norm": 0.19921875, "learning_rate": 0.00019637175236354175, "loss": 2.2035, "step": 2095 }, { "epoch": 0.36, "grad_norm": 0.2021484375, "learning_rate": 0.00019633219254340417, "loss": 2.1476, "step": 2100 }, { "epoch": 0.36, "grad_norm": 0.1953125, "learning_rate": 0.00019629242225348086, "loss": 2.1799, "step": 2105 }, { "epoch": 0.36, "grad_norm": 0.19140625, "learning_rate": 0.00019625244158066332, "loss": 2.2112, "step": 2110 }, { "epoch": 0.36, "grad_norm": 0.1962890625, "learning_rate": 0.0001962122506123026, "loss": 2.1967, "step": 2115 }, { "epoch": 0.36, "grad_norm": 0.1943359375, "learning_rate": 0.00019617184943620936, "loss": 2.1841, "step": 2120 }, { "epoch": 0.36, "grad_norm": 0.1875, "learning_rate": 0.00019613123814065335, "loss": 2.2235, "step": 2125 }, { "epoch": 0.36, "grad_norm": 0.189453125, "learning_rate": 0.00019609041681436354, "loss": 2.1743, "step": 2130 }, { "epoch": 0.36, "grad_norm": 0.1904296875, "learning_rate": 0.00019604938554652765, "loss": 2.1865, "step": 2135 }, { "epoch": 0.36, "grad_norm": 0.1884765625, "learning_rate": 0.00019600814442679226, "loss": 2.143, "step": 2140 }, { "epoch": 0.36, "grad_norm": 0.205078125, "learning_rate": 0.00019596669354526224, "loss": 2.2324, "step": 2145 }, { "epoch": 0.36, "grad_norm": 0.1982421875, "learning_rate": 0.00019592503299250096, "loss": 2.2198, "step": 2150 }, { "epoch": 0.36, "grad_norm": 0.193359375, "learning_rate": 0.0001958831628595297, "loss": 2.1736, "step": 2155 }, { "epoch": 0.37, "grad_norm": 0.189453125, "learning_rate": 0.00019584108323782777, "loss": 2.1709, "step": 2160 }, { "epoch": 0.37, "grad_norm": 0.1904296875, "learning_rate": 0.0001957987942193321, "loss": 2.1806, "step": 2165 }, { "epoch": 0.37, "grad_norm": 0.1884765625, "learning_rate": 0.00019575629589643718, "loss": 2.1568, "step": 2170 }, { "epoch": 0.37, "grad_norm": 0.1845703125, "learning_rate": 0.00019571358836199476, "loss": 2.1647, "step": 2175 }, { "epoch": 0.37, "grad_norm": 0.1943359375, "learning_rate": 0.00019567067170931366, "loss": 2.2088, "step": 2180 }, { "epoch": 0.37, "grad_norm": 0.19140625, "learning_rate": 0.00019562754603215962, "loss": 2.1749, "step": 2185 }, { "epoch": 0.37, "grad_norm": 0.197265625, "learning_rate": 0.00019558421142475507, "loss": 2.1569, "step": 2190 }, { "epoch": 0.37, "grad_norm": 0.193359375, "learning_rate": 0.0001955406679817789, "loss": 2.1758, "step": 2195 }, { "epoch": 0.37, "grad_norm": 0.193359375, "learning_rate": 0.00019549691579836626, "loss": 2.2226, "step": 2200 }, { "epoch": 0.37, "grad_norm": 0.201171875, "learning_rate": 0.00019545295497010843, "loss": 2.1599, "step": 2205 }, { "epoch": 0.37, "grad_norm": 0.19921875, "learning_rate": 0.0001954087855930524, "loss": 2.1589, "step": 2210 }, { "epoch": 0.38, "grad_norm": 0.1923828125, "learning_rate": 0.000195364407763701, "loss": 2.1723, "step": 2215 }, { "epoch": 0.38, "grad_norm": 0.19140625, "learning_rate": 0.00019531982157901232, "loss": 2.1533, "step": 2220 }, { "epoch": 0.38, "grad_norm": 0.193359375, "learning_rate": 0.00019527502713639975, "loss": 2.1804, "step": 2225 }, { "epoch": 0.38, "grad_norm": 0.2021484375, "learning_rate": 0.00019523002453373175, "loss": 2.163, "step": 2230 }, { "epoch": 0.38, "grad_norm": 0.193359375, "learning_rate": 0.0001951848138693314, "loss": 2.1807, "step": 2235 }, { "epoch": 0.38, "grad_norm": 0.2001953125, "learning_rate": 0.00019513939524197656, "loss": 2.1523, "step": 2240 }, { "epoch": 0.38, "grad_norm": 0.1875, "learning_rate": 0.0001950937687508993, "loss": 2.1963, "step": 2245 }, { "epoch": 0.38, "grad_norm": 0.2001953125, "learning_rate": 0.00019504793449578593, "loss": 2.171, "step": 2250 }, { "epoch": 0.38, "grad_norm": 0.1923828125, "learning_rate": 0.00019500189257677666, "loss": 2.1529, "step": 2255 }, { "epoch": 0.38, "grad_norm": 0.1943359375, "learning_rate": 0.0001949556430944654, "loss": 2.1683, "step": 2260 }, { "epoch": 0.38, "grad_norm": 0.1904296875, "learning_rate": 0.00019490918614989956, "loss": 2.1611, "step": 2265 }, { "epoch": 0.38, "grad_norm": 0.1923828125, "learning_rate": 0.00019486252184457977, "loss": 2.1865, "step": 2270 }, { "epoch": 0.39, "grad_norm": 0.1904296875, "learning_rate": 0.00019481565028045986, "loss": 2.1827, "step": 2275 }, { "epoch": 0.39, "grad_norm": 0.19140625, "learning_rate": 0.00019476857155994635, "loss": 2.1502, "step": 2280 }, { "epoch": 0.39, "grad_norm": 0.1923828125, "learning_rate": 0.00019472128578589833, "loss": 2.1553, "step": 2285 }, { "epoch": 0.39, "grad_norm": 0.1962890625, "learning_rate": 0.00019467379306162746, "loss": 2.2209, "step": 2290 }, { "epoch": 0.39, "grad_norm": 0.19140625, "learning_rate": 0.0001946260934908973, "loss": 2.202, "step": 2295 }, { "epoch": 0.39, "grad_norm": 0.1923828125, "learning_rate": 0.00019457818717792357, "loss": 2.1814, "step": 2300 }, { "epoch": 0.39, "grad_norm": 0.19140625, "learning_rate": 0.0001945300742273735, "loss": 2.1992, "step": 2305 }, { "epoch": 0.39, "grad_norm": 0.1923828125, "learning_rate": 0.00019448175474436592, "loss": 2.1637, "step": 2310 }, { "epoch": 0.39, "grad_norm": 0.1923828125, "learning_rate": 0.00019443322883447078, "loss": 2.1961, "step": 2315 }, { "epoch": 0.39, "grad_norm": 0.1904296875, "learning_rate": 0.00019438449660370922, "loss": 2.1988, "step": 2320 }, { "epoch": 0.39, "grad_norm": 0.1923828125, "learning_rate": 0.00019433555815855292, "loss": 2.1567, "step": 2325 }, { "epoch": 0.39, "grad_norm": 0.1943359375, "learning_rate": 0.0001942864136059243, "loss": 2.1266, "step": 2330 }, { "epoch": 0.4, "grad_norm": 0.19921875, "learning_rate": 0.000194237063053196, "loss": 2.1608, "step": 2335 }, { "epoch": 0.4, "grad_norm": 0.189453125, "learning_rate": 0.00019418750660819074, "loss": 2.1657, "step": 2340 }, { "epoch": 0.4, "grad_norm": 0.189453125, "learning_rate": 0.0001941377443791811, "loss": 2.1726, "step": 2345 }, { "epoch": 0.4, "grad_norm": 0.193359375, "learning_rate": 0.00019408777647488928, "loss": 2.2402, "step": 2350 }, { "epoch": 0.4, "grad_norm": 0.197265625, "learning_rate": 0.00019403760300448677, "loss": 2.1513, "step": 2355 }, { "epoch": 0.4, "grad_norm": 0.193359375, "learning_rate": 0.0001939872240775943, "loss": 2.1811, "step": 2360 }, { "epoch": 0.4, "grad_norm": 0.2001953125, "learning_rate": 0.0001939366398042814, "loss": 2.1592, "step": 2365 }, { "epoch": 0.4, "grad_norm": 0.2060546875, "learning_rate": 0.00019388585029506627, "loss": 2.1665, "step": 2370 }, { "epoch": 0.4, "grad_norm": 0.193359375, "learning_rate": 0.00019383485566091554, "loss": 2.1636, "step": 2375 }, { "epoch": 0.4, "grad_norm": 0.1884765625, "learning_rate": 0.000193783656013244, "loss": 2.144, "step": 2380 }, { "epoch": 0.4, "grad_norm": 0.1982421875, "learning_rate": 0.0001937322514639143, "loss": 2.1331, "step": 2385 }, { "epoch": 0.4, "grad_norm": 0.1904296875, "learning_rate": 0.00019368064212523686, "loss": 2.1441, "step": 2390 }, { "epoch": 0.41, "grad_norm": 0.1953125, "learning_rate": 0.0001936288281099694, "loss": 2.2009, "step": 2395 }, { "epoch": 0.41, "grad_norm": 0.19140625, "learning_rate": 0.00019357680953131703, "loss": 2.1558, "step": 2400 }, { "epoch": 0.41, "grad_norm": 0.1865234375, "learning_rate": 0.0001935245865029316, "loss": 2.1831, "step": 2405 }, { "epoch": 0.41, "grad_norm": 0.19921875, "learning_rate": 0.00019347215913891175, "loss": 2.1691, "step": 2410 }, { "epoch": 0.41, "grad_norm": 0.1923828125, "learning_rate": 0.00019341952755380252, "loss": 2.1821, "step": 2415 }, { "epoch": 0.41, "grad_norm": 0.197265625, "learning_rate": 0.00019336669186259515, "loss": 2.1822, "step": 2420 }, { "epoch": 0.41, "grad_norm": 0.19140625, "learning_rate": 0.00019331365218072682, "loss": 2.2013, "step": 2425 }, { "epoch": 0.41, "grad_norm": 0.189453125, "learning_rate": 0.0001932604086240804, "loss": 2.2009, "step": 2430 }, { "epoch": 0.41, "grad_norm": 0.2021484375, "learning_rate": 0.00019320696130898418, "loss": 2.0917, "step": 2435 }, { "epoch": 0.41, "grad_norm": 0.185546875, "learning_rate": 0.00019315331035221162, "loss": 2.1562, "step": 2440 }, { "epoch": 0.41, "grad_norm": 0.2001953125, "learning_rate": 0.00019309945587098117, "loss": 2.1827, "step": 2445 }, { "epoch": 0.41, "grad_norm": 0.19921875, "learning_rate": 0.00019304539798295587, "loss": 2.1584, "step": 2450 }, { "epoch": 0.42, "grad_norm": 0.197265625, "learning_rate": 0.0001929911368062432, "loss": 2.158, "step": 2455 }, { "epoch": 0.42, "grad_norm": 0.193359375, "learning_rate": 0.00019293667245939475, "loss": 2.171, "step": 2460 }, { "epoch": 0.42, "grad_norm": 0.1962890625, "learning_rate": 0.0001928820050614061, "loss": 2.1782, "step": 2465 }, { "epoch": 0.42, "grad_norm": 0.1962890625, "learning_rate": 0.00019282713473171633, "loss": 2.2018, "step": 2470 }, { "epoch": 0.42, "grad_norm": 0.193359375, "learning_rate": 0.00019277206159020805, "loss": 2.1583, "step": 2475 }, { "epoch": 0.42, "grad_norm": 0.2138671875, "learning_rate": 0.00019271678575720683, "loss": 2.1846, "step": 2480 }, { "epoch": 0.42, "grad_norm": 0.1953125, "learning_rate": 0.00019266130735348118, "loss": 2.1489, "step": 2485 }, { "epoch": 0.42, "grad_norm": 0.203125, "learning_rate": 0.0001926056265002422, "loss": 2.1503, "step": 2490 }, { "epoch": 0.42, "grad_norm": 0.1884765625, "learning_rate": 0.00019254974331914322, "loss": 2.1489, "step": 2495 }, { "epoch": 0.42, "grad_norm": 0.2001953125, "learning_rate": 0.00019249365793227966, "loss": 2.2092, "step": 2500 }, { "epoch": 0.42, "grad_norm": 0.197265625, "learning_rate": 0.0001924373704621888, "loss": 2.1788, "step": 2505 }, { "epoch": 0.43, "grad_norm": 0.1943359375, "learning_rate": 0.0001923808810318494, "loss": 2.1331, "step": 2510 }, { "epoch": 0.43, "grad_norm": 0.1943359375, "learning_rate": 0.00019232418976468133, "loss": 2.1295, "step": 2515 }, { "epoch": 0.43, "grad_norm": 0.1953125, "learning_rate": 0.0001922672967845457, "loss": 2.1849, "step": 2520 }, { "epoch": 0.43, "grad_norm": 0.1953125, "learning_rate": 0.00019221020221574413, "loss": 2.1991, "step": 2525 }, { "epoch": 0.43, "grad_norm": 0.193359375, "learning_rate": 0.00019215290618301875, "loss": 2.1679, "step": 2530 }, { "epoch": 0.43, "grad_norm": 0.1953125, "learning_rate": 0.00019209540881155176, "loss": 2.1439, "step": 2535 }, { "epoch": 0.43, "grad_norm": 0.1943359375, "learning_rate": 0.00019203771022696547, "loss": 2.1732, "step": 2540 }, { "epoch": 0.43, "grad_norm": 0.19140625, "learning_rate": 0.00019197981055532156, "loss": 2.1724, "step": 2545 }, { "epoch": 0.43, "grad_norm": 0.1953125, "learning_rate": 0.00019192170992312125, "loss": 2.1703, "step": 2550 }, { "epoch": 0.43, "grad_norm": 0.1953125, "learning_rate": 0.00019186340845730467, "loss": 2.1369, "step": 2555 }, { "epoch": 0.43, "grad_norm": 0.193359375, "learning_rate": 0.00019180490628525082, "loss": 2.1925, "step": 2560 }, { "epoch": 0.43, "grad_norm": 0.1943359375, "learning_rate": 0.00019174620353477724, "loss": 2.1806, "step": 2565 }, { "epoch": 0.44, "grad_norm": 0.1923828125, "learning_rate": 0.0001916873003341396, "loss": 2.1636, "step": 2570 }, { "epoch": 0.44, "grad_norm": 0.1923828125, "learning_rate": 0.0001916281968120316, "loss": 2.1723, "step": 2575 }, { "epoch": 0.44, "grad_norm": 0.1943359375, "learning_rate": 0.0001915688930975846, "loss": 2.1838, "step": 2580 }, { "epoch": 0.44, "grad_norm": 0.1943359375, "learning_rate": 0.0001915093893203673, "loss": 2.1576, "step": 2585 }, { "epoch": 0.44, "grad_norm": 0.189453125, "learning_rate": 0.00019144968561038558, "loss": 2.1672, "step": 2590 }, { "epoch": 0.44, "grad_norm": 0.197265625, "learning_rate": 0.00019138978209808208, "loss": 2.1246, "step": 2595 }, { "epoch": 0.44, "grad_norm": 0.19921875, "learning_rate": 0.00019132967891433595, "loss": 2.1887, "step": 2600 }, { "epoch": 0.44, "grad_norm": 0.201171875, "learning_rate": 0.00019126937619046267, "loss": 2.2243, "step": 2605 }, { "epoch": 0.44, "grad_norm": 0.2021484375, "learning_rate": 0.00019120887405821361, "loss": 2.1627, "step": 2610 }, { "epoch": 0.44, "grad_norm": 0.19140625, "learning_rate": 0.00019114817264977588, "loss": 2.1638, "step": 2615 }, { "epoch": 0.44, "grad_norm": 0.2021484375, "learning_rate": 0.00019108727209777196, "loss": 2.1382, "step": 2620 }, { "epoch": 0.44, "grad_norm": 0.197265625, "learning_rate": 0.00019102617253525934, "loss": 2.1539, "step": 2625 }, { "epoch": 0.45, "grad_norm": 0.189453125, "learning_rate": 0.00019096487409573043, "loss": 2.1688, "step": 2630 }, { "epoch": 0.45, "grad_norm": 0.1943359375, "learning_rate": 0.00019090337691311207, "loss": 2.1974, "step": 2635 }, { "epoch": 0.45, "grad_norm": 0.1904296875, "learning_rate": 0.0001908416811217654, "loss": 2.178, "step": 2640 }, { "epoch": 0.45, "grad_norm": 0.1953125, "learning_rate": 0.0001907797868564854, "loss": 2.1297, "step": 2645 }, { "epoch": 0.45, "grad_norm": 0.19140625, "learning_rate": 0.00019071769425250075, "loss": 2.161, "step": 2650 }, { "epoch": 0.45, "grad_norm": 0.1982421875, "learning_rate": 0.00019065540344547342, "loss": 2.1568, "step": 2655 }, { "epoch": 0.45, "grad_norm": 0.2080078125, "learning_rate": 0.00019059291457149846, "loss": 2.2083, "step": 2660 }, { "epoch": 0.45, "grad_norm": 0.2021484375, "learning_rate": 0.00019053022776710363, "loss": 2.1752, "step": 2665 }, { "epoch": 0.45, "grad_norm": 0.189453125, "learning_rate": 0.0001904673431692492, "loss": 2.145, "step": 2670 }, { "epoch": 0.45, "grad_norm": 0.19140625, "learning_rate": 0.00019040426091532743, "loss": 2.1651, "step": 2675 }, { "epoch": 0.45, "grad_norm": 0.1943359375, "learning_rate": 0.00019034098114316264, "loss": 2.2082, "step": 2680 }, { "epoch": 0.45, "grad_norm": 0.1982421875, "learning_rate": 0.00019027750399101053, "loss": 2.1772, "step": 2685 }, { "epoch": 0.46, "grad_norm": 0.201171875, "learning_rate": 0.00019021382959755808, "loss": 2.2035, "step": 2690 }, { "epoch": 0.46, "grad_norm": 0.19140625, "learning_rate": 0.00019014995810192332, "loss": 2.1952, "step": 2695 }, { "epoch": 0.46, "grad_norm": 0.1953125, "learning_rate": 0.0001900858896436547, "loss": 2.0956, "step": 2700 }, { "epoch": 0.46, "grad_norm": 0.1962890625, "learning_rate": 0.0001900216243627312, "loss": 2.1508, "step": 2705 }, { "epoch": 0.46, "grad_norm": 0.2021484375, "learning_rate": 0.00018995716239956175, "loss": 2.2125, "step": 2710 }, { "epoch": 0.46, "grad_norm": 0.193359375, "learning_rate": 0.00018989250389498497, "loss": 2.15, "step": 2715 }, { "epoch": 0.46, "grad_norm": 0.1953125, "learning_rate": 0.0001898276489902689, "loss": 2.1861, "step": 2720 }, { "epoch": 0.46, "grad_norm": 0.1904296875, "learning_rate": 0.00018976259782711074, "loss": 2.1673, "step": 2725 }, { "epoch": 0.46, "grad_norm": 0.197265625, "learning_rate": 0.00018969735054763645, "loss": 2.1716, "step": 2730 }, { "epoch": 0.46, "grad_norm": 0.1923828125, "learning_rate": 0.0001896319072944004, "loss": 2.186, "step": 2735 }, { "epoch": 0.46, "grad_norm": 0.197265625, "learning_rate": 0.00018956626821038522, "loss": 2.2132, "step": 2740 }, { "epoch": 0.46, "grad_norm": 0.197265625, "learning_rate": 0.00018950043343900138, "loss": 2.1679, "step": 2745 }, { "epoch": 0.47, "grad_norm": 0.1953125, "learning_rate": 0.0001894344031240869, "loss": 2.169, "step": 2750 }, { "epoch": 0.47, "grad_norm": 0.1923828125, "learning_rate": 0.00018936817740990692, "loss": 2.1564, "step": 2755 }, { "epoch": 0.47, "grad_norm": 0.197265625, "learning_rate": 0.00018930175644115373, "loss": 2.1463, "step": 2760 }, { "epoch": 0.47, "grad_norm": 0.193359375, "learning_rate": 0.00018923514036294598, "loss": 2.1655, "step": 2765 }, { "epoch": 0.47, "grad_norm": 0.2041015625, "learning_rate": 0.00018916832932082872, "loss": 2.1705, "step": 2770 }, { "epoch": 0.47, "grad_norm": 0.203125, "learning_rate": 0.00018910132346077295, "loss": 2.1628, "step": 2775 }, { "epoch": 0.47, "grad_norm": 0.19921875, "learning_rate": 0.0001890341229291753, "loss": 2.1291, "step": 2780 }, { "epoch": 0.47, "grad_norm": 0.1943359375, "learning_rate": 0.00018896672787285774, "loss": 2.1664, "step": 2785 }, { "epoch": 0.47, "grad_norm": 0.1943359375, "learning_rate": 0.00018889913843906725, "loss": 2.1971, "step": 2790 }, { "epoch": 0.47, "grad_norm": 0.201171875, "learning_rate": 0.00018883135477547542, "loss": 2.1711, "step": 2795 }, { "epoch": 0.47, "grad_norm": 0.19921875, "learning_rate": 0.0001887633770301783, "loss": 2.169, "step": 2800 }, { "epoch": 0.48, "grad_norm": 0.1962890625, "learning_rate": 0.00018869520535169597, "loss": 2.1618, "step": 2805 }, { "epoch": 0.48, "grad_norm": 0.1904296875, "learning_rate": 0.00018862683988897212, "loss": 2.1426, "step": 2810 }, { "epoch": 0.48, "grad_norm": 0.2001953125, "learning_rate": 0.0001885582807913739, "loss": 2.1659, "step": 2815 }, { "epoch": 0.48, "grad_norm": 0.19921875, "learning_rate": 0.00018848952820869154, "loss": 2.1803, "step": 2820 }, { "epoch": 0.48, "grad_norm": 0.1923828125, "learning_rate": 0.00018842058229113796, "loss": 2.1246, "step": 2825 }, { "epoch": 0.48, "grad_norm": 0.19921875, "learning_rate": 0.00018835144318934854, "loss": 2.167, "step": 2830 }, { "epoch": 0.48, "grad_norm": 0.201171875, "learning_rate": 0.0001882821110543806, "loss": 2.1674, "step": 2835 }, { "epoch": 0.48, "grad_norm": 0.1943359375, "learning_rate": 0.0001882125860377134, "loss": 2.147, "step": 2840 }, { "epoch": 0.48, "grad_norm": 0.2060546875, "learning_rate": 0.00018814286829124747, "loss": 2.1274, "step": 2845 }, { "epoch": 0.48, "grad_norm": 0.1923828125, "learning_rate": 0.00018807295796730445, "loss": 2.1769, "step": 2850 }, { "epoch": 0.48, "grad_norm": 0.19921875, "learning_rate": 0.00018800285521862679, "loss": 2.1788, "step": 2855 }, { "epoch": 0.48, "grad_norm": 0.1962890625, "learning_rate": 0.00018793256019837727, "loss": 2.1786, "step": 2860 }, { "epoch": 0.49, "grad_norm": 0.197265625, "learning_rate": 0.00018786207306013882, "loss": 2.1968, "step": 2865 }, { "epoch": 0.49, "grad_norm": 0.203125, "learning_rate": 0.00018779139395791407, "loss": 2.1675, "step": 2870 }, { "epoch": 0.49, "grad_norm": 0.1962890625, "learning_rate": 0.00018772052304612507, "loss": 2.1596, "step": 2875 }, { "epoch": 0.49, "grad_norm": 0.2041015625, "learning_rate": 0.000187649460479613, "loss": 2.1348, "step": 2880 }, { "epoch": 0.49, "grad_norm": 0.193359375, "learning_rate": 0.0001875782064136377, "loss": 2.1215, "step": 2885 }, { "epoch": 0.49, "grad_norm": 0.19921875, "learning_rate": 0.00018750676100387742, "loss": 2.2065, "step": 2890 }, { "epoch": 0.49, "grad_norm": 0.1943359375, "learning_rate": 0.00018743512440642845, "loss": 2.1686, "step": 2895 }, { "epoch": 0.49, "grad_norm": 0.203125, "learning_rate": 0.00018736329677780487, "loss": 2.1854, "step": 2900 }, { "epoch": 0.49, "grad_norm": 0.1904296875, "learning_rate": 0.00018729127827493805, "loss": 2.1674, "step": 2905 }, { "epoch": 0.49, "grad_norm": 0.1953125, "learning_rate": 0.0001872190690551764, "loss": 2.1876, "step": 2910 }, { "epoch": 0.49, "grad_norm": 0.1982421875, "learning_rate": 0.00018714666927628504, "loss": 2.1409, "step": 2915 }, { "epoch": 0.49, "grad_norm": 0.197265625, "learning_rate": 0.00018707407909644542, "loss": 2.1408, "step": 2920 }, { "epoch": 0.5, "grad_norm": 0.205078125, "learning_rate": 0.00018700129867425504, "loss": 2.2294, "step": 2925 }, { "epoch": 0.5, "grad_norm": 0.212890625, "learning_rate": 0.0001869283281687269, "loss": 2.1731, "step": 2930 }, { "epoch": 0.5, "grad_norm": 0.1982421875, "learning_rate": 0.00018685516773928943, "loss": 2.1667, "step": 2935 }, { "epoch": 0.5, "grad_norm": 0.1982421875, "learning_rate": 0.00018678181754578602, "loss": 2.1689, "step": 2940 }, { "epoch": 0.5, "grad_norm": 0.1982421875, "learning_rate": 0.00018670827774847456, "loss": 2.2155, "step": 2945 }, { "epoch": 0.5, "grad_norm": 0.2158203125, "learning_rate": 0.00018663454850802728, "loss": 2.1756, "step": 2950 }, { "epoch": 0.5, "grad_norm": 0.19921875, "learning_rate": 0.0001865606299855303, "loss": 2.1609, "step": 2955 }, { "epoch": 0.5, "grad_norm": 0.19140625, "learning_rate": 0.0001864865223424832, "loss": 2.1553, "step": 2960 }, { "epoch": 0.5, "grad_norm": 0.193359375, "learning_rate": 0.0001864122257407989, "loss": 2.1826, "step": 2965 }, { "epoch": 0.5, "grad_norm": 0.1884765625, "learning_rate": 0.00018633774034280306, "loss": 2.1677, "step": 2970 }, { "epoch": 0.5, "grad_norm": 0.203125, "learning_rate": 0.00018626306631123386, "loss": 2.156, "step": 2975 }, { "epoch": 0.5, "grad_norm": 0.19921875, "learning_rate": 0.00018618820380924165, "loss": 2.1514, "step": 2980 }, { "epoch": 0.51, "grad_norm": 0.2041015625, "learning_rate": 0.00018611315300038847, "loss": 2.1479, "step": 2985 }, { "epoch": 0.51, "grad_norm": 0.201171875, "learning_rate": 0.00018603791404864784, "loss": 2.1405, "step": 2990 }, { "epoch": 0.51, "grad_norm": 0.212890625, "learning_rate": 0.00018596248711840436, "loss": 2.1531, "step": 2995 }, { "epoch": 0.51, "grad_norm": 0.1982421875, "learning_rate": 0.0001858868723744533, "loss": 2.1746, "step": 3000 }, { "epoch": 0.51, "grad_norm": 0.1953125, "learning_rate": 0.00018581106998200023, "loss": 2.1487, "step": 3005 }, { "epoch": 0.51, "grad_norm": 0.21484375, "learning_rate": 0.00018573508010666078, "loss": 2.2017, "step": 3010 }, { "epoch": 0.51, "grad_norm": 0.1962890625, "learning_rate": 0.00018565890291446014, "loss": 2.1301, "step": 3015 }, { "epoch": 0.51, "grad_norm": 0.1982421875, "learning_rate": 0.00018558253857183277, "loss": 2.139, "step": 3020 }, { "epoch": 0.51, "grad_norm": 0.2021484375, "learning_rate": 0.0001855059872456221, "loss": 2.1775, "step": 3025 }, { "epoch": 0.51, "grad_norm": 0.1962890625, "learning_rate": 0.00018542924910307996, "loss": 2.1787, "step": 3030 }, { "epoch": 0.51, "grad_norm": 0.1923828125, "learning_rate": 0.0001853523243118664, "loss": 2.14, "step": 3035 }, { "epoch": 0.51, "grad_norm": 0.203125, "learning_rate": 0.00018527521304004932, "loss": 2.1609, "step": 3040 }, { "epoch": 0.52, "grad_norm": 0.1943359375, "learning_rate": 0.00018519791545610392, "loss": 2.1944, "step": 3045 }, { "epoch": 0.52, "grad_norm": 0.2021484375, "learning_rate": 0.0001851204317289126, "loss": 2.1888, "step": 3050 }, { "epoch": 0.52, "grad_norm": 0.1953125, "learning_rate": 0.00018504276202776438, "loss": 2.1624, "step": 3055 }, { "epoch": 0.52, "grad_norm": 0.19921875, "learning_rate": 0.00018496490652235455, "loss": 2.1327, "step": 3060 }, { "epoch": 0.52, "grad_norm": 0.2001953125, "learning_rate": 0.00018488686538278452, "loss": 2.154, "step": 3065 }, { "epoch": 0.52, "grad_norm": 0.1875, "learning_rate": 0.0001848086387795611, "loss": 2.1481, "step": 3070 }, { "epoch": 0.52, "grad_norm": 0.2001953125, "learning_rate": 0.0001847302268835964, "loss": 2.1466, "step": 3075 }, { "epoch": 0.52, "grad_norm": 0.197265625, "learning_rate": 0.00018465162986620737, "loss": 2.1797, "step": 3080 }, { "epoch": 0.52, "grad_norm": 0.19921875, "learning_rate": 0.00018457284789911532, "loss": 2.1701, "step": 3085 }, { "epoch": 0.52, "grad_norm": 0.1962890625, "learning_rate": 0.00018449388115444578, "loss": 2.1868, "step": 3090 }, { "epoch": 0.52, "grad_norm": 0.1982421875, "learning_rate": 0.00018441472980472795, "loss": 2.1842, "step": 3095 }, { "epoch": 0.53, "grad_norm": 0.1982421875, "learning_rate": 0.00018433539402289427, "loss": 2.1489, "step": 3100 }, { "epoch": 0.53, "grad_norm": 0.2041015625, "learning_rate": 0.00018425587398228021, "loss": 2.1478, "step": 3105 }, { "epoch": 0.53, "grad_norm": 0.2021484375, "learning_rate": 0.00018417616985662386, "loss": 2.166, "step": 3110 }, { "epoch": 0.53, "grad_norm": 0.189453125, "learning_rate": 0.0001840962818200654, "loss": 2.1668, "step": 3115 }, { "epoch": 0.53, "grad_norm": 0.2001953125, "learning_rate": 0.0001840162100471469, "loss": 2.1575, "step": 3120 }, { "epoch": 0.53, "grad_norm": 0.2060546875, "learning_rate": 0.00018393595471281182, "loss": 2.1573, "step": 3125 }, { "epoch": 0.53, "grad_norm": 0.1982421875, "learning_rate": 0.00018385551599240472, "loss": 2.2463, "step": 3130 }, { "epoch": 0.53, "grad_norm": 0.1923828125, "learning_rate": 0.00018377489406167077, "loss": 2.1743, "step": 3135 }, { "epoch": 0.53, "grad_norm": 0.201171875, "learning_rate": 0.00018369408909675543, "loss": 2.1865, "step": 3140 }, { "epoch": 0.53, "grad_norm": 0.2041015625, "learning_rate": 0.00018361310127420417, "loss": 2.1548, "step": 3145 }, { "epoch": 0.53, "grad_norm": 0.197265625, "learning_rate": 0.00018353193077096178, "loss": 2.1521, "step": 3150 }, { "epoch": 0.53, "grad_norm": 0.2021484375, "learning_rate": 0.00018345057776437233, "loss": 2.1347, "step": 3155 }, { "epoch": 0.54, "grad_norm": 0.1982421875, "learning_rate": 0.0001833690424321786, "loss": 2.176, "step": 3160 }, { "epoch": 0.54, "grad_norm": 0.2001953125, "learning_rate": 0.00018328732495252167, "loss": 2.1327, "step": 3165 }, { "epoch": 0.54, "grad_norm": 0.203125, "learning_rate": 0.00018320542550394065, "loss": 2.1949, "step": 3170 }, { "epoch": 0.54, "grad_norm": 0.19921875, "learning_rate": 0.00018312334426537214, "loss": 2.1317, "step": 3175 }, { "epoch": 0.54, "grad_norm": 0.2001953125, "learning_rate": 0.00018304108141615, "loss": 2.1799, "step": 3180 }, { "epoch": 0.54, "grad_norm": 0.1943359375, "learning_rate": 0.0001829586371360048, "loss": 2.214, "step": 3185 }, { "epoch": 0.54, "grad_norm": 0.20703125, "learning_rate": 0.00018287601160506362, "loss": 2.1478, "step": 3190 }, { "epoch": 0.54, "grad_norm": 0.2001953125, "learning_rate": 0.00018279320500384942, "loss": 2.1804, "step": 3195 }, { "epoch": 0.54, "grad_norm": 0.19921875, "learning_rate": 0.00018271021751328084, "loss": 2.1779, "step": 3200 }, { "epoch": 0.54, "grad_norm": 0.19921875, "learning_rate": 0.00018262704931467174, "loss": 2.1433, "step": 3205 }, { "epoch": 0.54, "grad_norm": 0.201171875, "learning_rate": 0.00018254370058973072, "loss": 2.1722, "step": 3210 }, { "epoch": 0.54, "grad_norm": 0.193359375, "learning_rate": 0.0001824601715205609, "loss": 2.133, "step": 3215 }, { "epoch": 0.55, "grad_norm": 0.205078125, "learning_rate": 0.00018237646228965937, "loss": 2.1461, "step": 3220 }, { "epoch": 0.55, "grad_norm": 0.197265625, "learning_rate": 0.0001822925730799168, "loss": 2.1887, "step": 3225 }, { "epoch": 0.55, "grad_norm": 0.193359375, "learning_rate": 0.00018220850407461717, "loss": 2.1615, "step": 3230 }, { "epoch": 0.55, "grad_norm": 0.1923828125, "learning_rate": 0.0001821242554574373, "loss": 2.1579, "step": 3235 }, { "epoch": 0.55, "grad_norm": 0.208984375, "learning_rate": 0.00018203982741244628, "loss": 2.1899, "step": 3240 }, { "epoch": 0.55, "grad_norm": 0.208984375, "learning_rate": 0.00018195522012410536, "loss": 2.1738, "step": 3245 }, { "epoch": 0.55, "grad_norm": 0.21484375, "learning_rate": 0.00018187043377726735, "loss": 2.1169, "step": 3250 }, { "epoch": 0.55, "grad_norm": 0.2041015625, "learning_rate": 0.0001817854685571763, "loss": 2.1394, "step": 3255 }, { "epoch": 0.55, "grad_norm": 0.19921875, "learning_rate": 0.00018170032464946708, "loss": 2.1765, "step": 3260 }, { "epoch": 0.55, "grad_norm": 0.19921875, "learning_rate": 0.0001816150022401649, "loss": 2.174, "step": 3265 }, { "epoch": 0.55, "grad_norm": 0.1962890625, "learning_rate": 0.00018152950151568504, "loss": 2.1572, "step": 3270 }, { "epoch": 0.55, "grad_norm": 0.1962890625, "learning_rate": 0.0001814438226628323, "loss": 2.1492, "step": 3275 }, { "epoch": 0.56, "grad_norm": 0.2021484375, "learning_rate": 0.00018135796586880068, "loss": 2.1177, "step": 3280 }, { "epoch": 0.56, "grad_norm": 0.19921875, "learning_rate": 0.000181271931321173, "loss": 2.1699, "step": 3285 }, { "epoch": 0.56, "grad_norm": 0.1943359375, "learning_rate": 0.0001811857192079204, "loss": 2.1318, "step": 3290 }, { "epoch": 0.56, "grad_norm": 0.197265625, "learning_rate": 0.000181099329717402, "loss": 2.1738, "step": 3295 }, { "epoch": 0.56, "grad_norm": 0.1953125, "learning_rate": 0.00018101276303836438, "loss": 2.1476, "step": 3300 }, { "epoch": 0.56, "grad_norm": 0.1943359375, "learning_rate": 0.00018092601935994137, "loss": 2.1671, "step": 3305 }, { "epoch": 0.56, "grad_norm": 0.1982421875, "learning_rate": 0.0001808390988716534, "loss": 2.1648, "step": 3310 }, { "epoch": 0.56, "grad_norm": 0.197265625, "learning_rate": 0.0001807520017634073, "loss": 2.1088, "step": 3315 }, { "epoch": 0.56, "grad_norm": 0.1982421875, "learning_rate": 0.00018066472822549567, "loss": 2.154, "step": 3320 }, { "epoch": 0.56, "grad_norm": 0.208984375, "learning_rate": 0.00018057727844859672, "loss": 2.1864, "step": 3325 }, { "epoch": 0.56, "grad_norm": 0.205078125, "learning_rate": 0.00018048965262377358, "loss": 2.1863, "step": 3330 }, { "epoch": 0.56, "grad_norm": 0.19921875, "learning_rate": 0.00018040185094247413, "loss": 2.1081, "step": 3335 }, { "epoch": 0.57, "grad_norm": 0.203125, "learning_rate": 0.00018031387359653035, "loss": 2.1357, "step": 3340 }, { "epoch": 0.57, "grad_norm": 0.19921875, "learning_rate": 0.00018022572077815808, "loss": 2.1908, "step": 3345 }, { "epoch": 0.57, "grad_norm": 0.1943359375, "learning_rate": 0.00018013739267995659, "loss": 2.1439, "step": 3350 }, { "epoch": 0.57, "grad_norm": 0.2021484375, "learning_rate": 0.00018004888949490802, "loss": 2.2017, "step": 3355 }, { "epoch": 0.57, "grad_norm": 0.19921875, "learning_rate": 0.00017996021141637709, "loss": 2.1607, "step": 3360 }, { "epoch": 0.57, "grad_norm": 0.2080078125, "learning_rate": 0.00017987135863811062, "loss": 2.1538, "step": 3365 }, { "epoch": 0.57, "grad_norm": 0.19921875, "learning_rate": 0.0001797823313542371, "loss": 2.1318, "step": 3370 }, { "epoch": 0.57, "grad_norm": 0.203125, "learning_rate": 0.00017969312975926632, "loss": 2.1433, "step": 3375 }, { "epoch": 0.57, "grad_norm": 0.1923828125, "learning_rate": 0.0001796037540480889, "loss": 2.1633, "step": 3380 }, { "epoch": 0.57, "grad_norm": 0.2177734375, "learning_rate": 0.0001795142044159759, "loss": 2.1587, "step": 3385 }, { "epoch": 0.57, "grad_norm": 0.2021484375, "learning_rate": 0.0001794244810585783, "loss": 2.1575, "step": 3390 }, { "epoch": 0.57, "grad_norm": 0.2041015625, "learning_rate": 0.00017933458417192672, "loss": 2.1543, "step": 3395 }, { "epoch": 0.58, "grad_norm": 0.19921875, "learning_rate": 0.00017924451395243086, "loss": 2.1969, "step": 3400 }, { "epoch": 0.58, "grad_norm": 0.19921875, "learning_rate": 0.00017915427059687908, "loss": 2.1322, "step": 3405 }, { "epoch": 0.58, "grad_norm": 0.2001953125, "learning_rate": 0.00017906385430243817, "loss": 2.1745, "step": 3410 }, { "epoch": 0.58, "grad_norm": 0.205078125, "learning_rate": 0.0001789732652666526, "loss": 2.1668, "step": 3415 }, { "epoch": 0.58, "grad_norm": 0.193359375, "learning_rate": 0.00017888250368744437, "loss": 2.1606, "step": 3420 }, { "epoch": 0.58, "grad_norm": 0.1962890625, "learning_rate": 0.00017879156976311234, "loss": 2.1449, "step": 3425 }, { "epoch": 0.58, "grad_norm": 0.2041015625, "learning_rate": 0.000178700463692332, "loss": 2.1682, "step": 3430 }, { "epoch": 0.58, "grad_norm": 0.203125, "learning_rate": 0.00017860918567415496, "loss": 2.1207, "step": 3435 }, { "epoch": 0.58, "grad_norm": 0.2001953125, "learning_rate": 0.00017851773590800844, "loss": 2.1677, "step": 3440 }, { "epoch": 0.58, "grad_norm": 0.2001953125, "learning_rate": 0.00017842611459369497, "loss": 2.1592, "step": 3445 }, { "epoch": 0.58, "grad_norm": 0.2060546875, "learning_rate": 0.0001783343219313918, "loss": 2.1685, "step": 3450 }, { "epoch": 0.59, "grad_norm": 0.228515625, "learning_rate": 0.0001782423581216507, "loss": 2.1355, "step": 3455 }, { "epoch": 0.59, "grad_norm": 0.20703125, "learning_rate": 0.00017815022336539716, "loss": 2.1463, "step": 3460 }, { "epoch": 0.59, "grad_norm": 0.1982421875, "learning_rate": 0.00017805791786393028, "loss": 2.1927, "step": 3465 }, { "epoch": 0.59, "grad_norm": 0.201171875, "learning_rate": 0.00017796544181892228, "loss": 2.1393, "step": 3470 }, { "epoch": 0.59, "grad_norm": 0.2041015625, "learning_rate": 0.00017787279543241783, "loss": 2.1723, "step": 3475 }, { "epoch": 0.59, "grad_norm": 0.2001953125, "learning_rate": 0.00017777997890683385, "loss": 2.1761, "step": 3480 }, { "epoch": 0.59, "grad_norm": 0.2021484375, "learning_rate": 0.00017768699244495904, "loss": 2.1744, "step": 3485 }, { "epoch": 0.59, "grad_norm": 0.205078125, "learning_rate": 0.00017759383624995321, "loss": 2.1923, "step": 3490 }, { "epoch": 0.59, "grad_norm": 0.1962890625, "learning_rate": 0.00017750051052534724, "loss": 2.1148, "step": 3495 }, { "epoch": 0.59, "grad_norm": 0.201171875, "learning_rate": 0.0001774070154750422, "loss": 2.1625, "step": 3500 }, { "epoch": 0.59, "grad_norm": 0.205078125, "learning_rate": 0.00017731335130330927, "loss": 2.1456, "step": 3505 }, { "epoch": 0.59, "grad_norm": 0.1953125, "learning_rate": 0.00017721951821478898, "loss": 2.1667, "step": 3510 }, { "epoch": 0.6, "grad_norm": 0.2001953125, "learning_rate": 0.00017712551641449099, "loss": 2.2208, "step": 3515 }, { "epoch": 0.6, "grad_norm": 0.1953125, "learning_rate": 0.00017703134610779362, "loss": 2.1765, "step": 3520 }, { "epoch": 0.6, "grad_norm": 0.2001953125, "learning_rate": 0.00017693700750044328, "loss": 2.176, "step": 3525 }, { "epoch": 0.6, "grad_norm": 0.1953125, "learning_rate": 0.0001768425007985541, "loss": 2.1524, "step": 3530 }, { "epoch": 0.6, "grad_norm": 0.197265625, "learning_rate": 0.00017674782620860744, "loss": 2.1427, "step": 3535 }, { "epoch": 0.6, "grad_norm": 0.1943359375, "learning_rate": 0.00017665298393745152, "loss": 2.1892, "step": 3540 }, { "epoch": 0.6, "grad_norm": 0.197265625, "learning_rate": 0.00017655797419230095, "loss": 2.1542, "step": 3545 }, { "epoch": 0.6, "grad_norm": 0.203125, "learning_rate": 0.00017646279718073611, "loss": 2.1891, "step": 3550 }, { "epoch": 0.6, "grad_norm": 0.2109375, "learning_rate": 0.00017636745311070296, "loss": 2.1905, "step": 3555 }, { "epoch": 0.6, "grad_norm": 0.197265625, "learning_rate": 0.00017627194219051238, "loss": 2.164, "step": 3560 }, { "epoch": 0.6, "grad_norm": 0.19921875, "learning_rate": 0.0001761762646288398, "loss": 2.1426, "step": 3565 }, { "epoch": 0.6, "grad_norm": 0.2060546875, "learning_rate": 0.0001760804206347248, "loss": 2.1479, "step": 3570 }, { "epoch": 0.61, "grad_norm": 0.205078125, "learning_rate": 0.00017598441041757047, "loss": 2.1541, "step": 3575 }, { "epoch": 0.61, "grad_norm": 0.19921875, "learning_rate": 0.00017588823418714314, "loss": 2.1498, "step": 3580 }, { "epoch": 0.61, "grad_norm": 0.205078125, "learning_rate": 0.00017579189215357187, "loss": 2.1466, "step": 3585 }, { "epoch": 0.61, "grad_norm": 0.203125, "learning_rate": 0.00017569538452734797, "loss": 2.1702, "step": 3590 }, { "epoch": 0.61, "grad_norm": 0.2001953125, "learning_rate": 0.00017559871151932448, "loss": 2.1822, "step": 3595 }, { "epoch": 0.61, "grad_norm": 0.201171875, "learning_rate": 0.0001755018733407158, "loss": 2.1712, "step": 3600 }, { "epoch": 0.61, "grad_norm": 0.19921875, "learning_rate": 0.00017540487020309726, "loss": 2.1469, "step": 3605 }, { "epoch": 0.61, "grad_norm": 0.20703125, "learning_rate": 0.0001753077023184045, "loss": 2.1792, "step": 3610 }, { "epoch": 0.61, "grad_norm": 0.19921875, "learning_rate": 0.00017521036989893318, "loss": 2.1561, "step": 3615 }, { "epoch": 0.61, "grad_norm": 0.203125, "learning_rate": 0.00017511287315733837, "loss": 2.164, "step": 3620 }, { "epoch": 0.61, "grad_norm": 0.2021484375, "learning_rate": 0.00017501521230663429, "loss": 2.1957, "step": 3625 }, { "epoch": 0.61, "grad_norm": 0.2001953125, "learning_rate": 0.00017491738756019357, "loss": 2.147, "step": 3630 }, { "epoch": 0.62, "grad_norm": 0.197265625, "learning_rate": 0.00017481939913174696, "loss": 2.1521, "step": 3635 }, { "epoch": 0.62, "grad_norm": 0.21875, "learning_rate": 0.00017472124723538288, "loss": 2.1614, "step": 3640 }, { "epoch": 0.62, "grad_norm": 0.203125, "learning_rate": 0.00017462293208554683, "loss": 2.1617, "step": 3645 }, { "epoch": 0.62, "grad_norm": 0.2080078125, "learning_rate": 0.00017452445389704106, "loss": 2.1826, "step": 3650 }, { "epoch": 0.62, "grad_norm": 0.2041015625, "learning_rate": 0.00017442581288502397, "loss": 2.1398, "step": 3655 }, { "epoch": 0.62, "grad_norm": 0.205078125, "learning_rate": 0.00017432700926500977, "loss": 2.1396, "step": 3660 }, { "epoch": 0.62, "grad_norm": 0.20703125, "learning_rate": 0.00017422804325286788, "loss": 2.133, "step": 3665 }, { "epoch": 0.62, "grad_norm": 0.2041015625, "learning_rate": 0.0001741289150648225, "loss": 2.2083, "step": 3670 }, { "epoch": 0.62, "grad_norm": 0.2080078125, "learning_rate": 0.00017402962491745228, "loss": 2.1073, "step": 3675 }, { "epoch": 0.62, "grad_norm": 0.205078125, "learning_rate": 0.00017393017302768963, "loss": 2.2294, "step": 3680 }, { "epoch": 0.62, "grad_norm": 0.1982421875, "learning_rate": 0.00017383055961282028, "loss": 2.1668, "step": 3685 }, { "epoch": 0.62, "grad_norm": 0.203125, "learning_rate": 0.00017373078489048302, "loss": 2.1639, "step": 3690 }, { "epoch": 0.63, "grad_norm": 0.19140625, "learning_rate": 0.00017363084907866895, "loss": 2.1752, "step": 3695 }, { "epoch": 0.63, "grad_norm": 0.201171875, "learning_rate": 0.00017353075239572117, "loss": 2.1609, "step": 3700 }, { "epoch": 0.63, "grad_norm": 0.2021484375, "learning_rate": 0.00017343049506033425, "loss": 2.1996, "step": 3705 }, { "epoch": 0.63, "grad_norm": 0.1982421875, "learning_rate": 0.00017333007729155377, "loss": 2.1319, "step": 3710 }, { "epoch": 0.63, "grad_norm": 0.208984375, "learning_rate": 0.00017322949930877583, "loss": 2.1314, "step": 3715 }, { "epoch": 0.63, "grad_norm": 0.2021484375, "learning_rate": 0.00017312876133174655, "loss": 2.1361, "step": 3720 }, { "epoch": 0.63, "grad_norm": 0.2041015625, "learning_rate": 0.00017302786358056155, "loss": 2.1442, "step": 3725 }, { "epoch": 0.63, "grad_norm": 0.2001953125, "learning_rate": 0.00017292680627566568, "loss": 2.1606, "step": 3730 }, { "epoch": 0.63, "grad_norm": 0.2021484375, "learning_rate": 0.00017282558963785234, "loss": 2.1756, "step": 3735 }, { "epoch": 0.63, "grad_norm": 0.2099609375, "learning_rate": 0.0001727242138882629, "loss": 2.1586, "step": 3740 }, { "epoch": 0.63, "grad_norm": 0.2001953125, "learning_rate": 0.00017262267924838658, "loss": 2.148, "step": 3745 }, { "epoch": 0.64, "grad_norm": 0.1982421875, "learning_rate": 0.0001725209859400596, "loss": 2.1351, "step": 3750 }, { "epoch": 0.64, "grad_norm": 0.2041015625, "learning_rate": 0.0001724191341854649, "loss": 2.1536, "step": 3755 }, { "epoch": 0.64, "grad_norm": 0.2001953125, "learning_rate": 0.00017231712420713157, "loss": 2.15, "step": 3760 }, { "epoch": 0.64, "grad_norm": 0.197265625, "learning_rate": 0.00017221495622793444, "loss": 2.1419, "step": 3765 }, { "epoch": 0.64, "grad_norm": 0.19921875, "learning_rate": 0.00017211263047109353, "loss": 2.1394, "step": 3770 }, { "epoch": 0.64, "grad_norm": 0.208984375, "learning_rate": 0.00017201014716017348, "loss": 2.2241, "step": 3775 }, { "epoch": 0.64, "grad_norm": 0.1982421875, "learning_rate": 0.00017190750651908336, "loss": 2.1367, "step": 3780 }, { "epoch": 0.64, "grad_norm": 0.1982421875, "learning_rate": 0.00017180470877207576, "loss": 2.1249, "step": 3785 }, { "epoch": 0.64, "grad_norm": 0.2021484375, "learning_rate": 0.0001717017541437467, "loss": 2.1624, "step": 3790 }, { "epoch": 0.64, "grad_norm": 0.1943359375, "learning_rate": 0.00017159864285903488, "loss": 2.1353, "step": 3795 }, { "epoch": 0.64, "grad_norm": 0.2001953125, "learning_rate": 0.00017149537514322123, "loss": 2.1452, "step": 3800 }, { "epoch": 0.64, "grad_norm": 0.201171875, "learning_rate": 0.0001713919512219285, "loss": 2.1418, "step": 3805 }, { "epoch": 0.65, "grad_norm": 0.2060546875, "learning_rate": 0.00017128837132112076, "loss": 2.1446, "step": 3810 }, { "epoch": 0.65, "grad_norm": 0.2041015625, "learning_rate": 0.00017118463566710284, "loss": 2.1525, "step": 3815 }, { "epoch": 0.65, "grad_norm": 0.2041015625, "learning_rate": 0.00017108074448651976, "loss": 2.1617, "step": 3820 }, { "epoch": 0.65, "grad_norm": 0.201171875, "learning_rate": 0.00017097669800635653, "loss": 2.1856, "step": 3825 }, { "epoch": 0.65, "grad_norm": 0.205078125, "learning_rate": 0.00017087249645393734, "loss": 2.1432, "step": 3830 }, { "epoch": 0.65, "grad_norm": 0.2041015625, "learning_rate": 0.00017076814005692522, "loss": 2.1366, "step": 3835 }, { "epoch": 0.65, "grad_norm": 0.203125, "learning_rate": 0.0001706636290433215, "loss": 2.1779, "step": 3840 }, { "epoch": 0.65, "grad_norm": 0.2001953125, "learning_rate": 0.00017055896364146528, "loss": 2.1282, "step": 3845 }, { "epoch": 0.65, "grad_norm": 0.197265625, "learning_rate": 0.00017045414408003312, "loss": 2.1785, "step": 3850 }, { "epoch": 0.65, "grad_norm": 0.2080078125, "learning_rate": 0.00017034917058803822, "loss": 2.1825, "step": 3855 }, { "epoch": 0.65, "grad_norm": 0.19921875, "learning_rate": 0.00017024404339483016, "loss": 2.1743, "step": 3860 }, { "epoch": 0.65, "grad_norm": 0.201171875, "learning_rate": 0.00017013876273009438, "loss": 2.1668, "step": 3865 }, { "epoch": 0.66, "grad_norm": 0.197265625, "learning_rate": 0.00017003332882385155, "loss": 2.1435, "step": 3870 }, { "epoch": 0.66, "grad_norm": 0.1962890625, "learning_rate": 0.0001699277419064572, "loss": 2.1453, "step": 3875 }, { "epoch": 0.66, "grad_norm": 0.201171875, "learning_rate": 0.00016982200220860114, "loss": 2.1617, "step": 3880 }, { "epoch": 0.66, "grad_norm": 0.197265625, "learning_rate": 0.00016971610996130703, "loss": 2.1807, "step": 3885 }, { "epoch": 0.66, "grad_norm": 0.201171875, "learning_rate": 0.0001696100653959317, "loss": 2.1619, "step": 3890 }, { "epoch": 0.66, "grad_norm": 0.203125, "learning_rate": 0.0001695038687441649, "loss": 2.1842, "step": 3895 }, { "epoch": 0.66, "grad_norm": 0.1962890625, "learning_rate": 0.0001693975202380286, "loss": 2.1851, "step": 3900 }, { "epoch": 0.66, "grad_norm": 0.2099609375, "learning_rate": 0.0001692910201098766, "loss": 2.1786, "step": 3905 }, { "epoch": 0.66, "grad_norm": 0.1982421875, "learning_rate": 0.00016918436859239387, "loss": 2.1408, "step": 3910 }, { "epoch": 0.66, "grad_norm": 0.205078125, "learning_rate": 0.00016907756591859628, "loss": 2.1447, "step": 3915 }, { "epoch": 0.66, "grad_norm": 0.2041015625, "learning_rate": 0.00016897061232182977, "loss": 2.1707, "step": 3920 }, { "epoch": 0.66, "grad_norm": 0.205078125, "learning_rate": 0.0001688635080357702, "loss": 2.1928, "step": 3925 }, { "epoch": 0.67, "grad_norm": 0.2177734375, "learning_rate": 0.00016875625329442257, "loss": 2.169, "step": 3930 }, { "epoch": 0.67, "grad_norm": 0.2001953125, "learning_rate": 0.0001686488483321206, "loss": 2.1886, "step": 3935 }, { "epoch": 0.67, "grad_norm": 0.205078125, "learning_rate": 0.00016854129338352624, "loss": 2.2059, "step": 3940 }, { "epoch": 0.67, "grad_norm": 0.2060546875, "learning_rate": 0.00016843358868362914, "loss": 2.1918, "step": 3945 }, { "epoch": 0.67, "grad_norm": 0.1962890625, "learning_rate": 0.0001683257344677461, "loss": 2.1218, "step": 3950 }, { "epoch": 0.67, "grad_norm": 0.205078125, "learning_rate": 0.0001682177309715206, "loss": 2.1635, "step": 3955 }, { "epoch": 0.67, "grad_norm": 0.2099609375, "learning_rate": 0.0001681095784309223, "loss": 2.1986, "step": 3960 }, { "epoch": 0.67, "grad_norm": 0.19921875, "learning_rate": 0.00016800127708224648, "loss": 2.101, "step": 3965 }, { "epoch": 0.67, "grad_norm": 0.197265625, "learning_rate": 0.0001678928271621135, "loss": 2.1343, "step": 3970 }, { "epoch": 0.67, "grad_norm": 0.205078125, "learning_rate": 0.0001677842289074684, "loss": 2.1897, "step": 3975 }, { "epoch": 0.67, "grad_norm": 0.19921875, "learning_rate": 0.00016767548255558023, "loss": 2.1349, "step": 3980 }, { "epoch": 0.67, "grad_norm": 0.2109375, "learning_rate": 0.0001675665883440417, "loss": 2.1386, "step": 3985 }, { "epoch": 0.68, "grad_norm": 0.2021484375, "learning_rate": 0.00016745754651076848, "loss": 2.1427, "step": 3990 }, { "epoch": 0.68, "grad_norm": 0.2099609375, "learning_rate": 0.00016734835729399877, "loss": 2.1637, "step": 3995 }, { "epoch": 0.68, "grad_norm": 0.2099609375, "learning_rate": 0.0001672390209322929, "loss": 2.1205, "step": 4000 }, { "epoch": 0.68, "grad_norm": 0.203125, "learning_rate": 0.0001671295376645325, "loss": 2.1365, "step": 4005 }, { "epoch": 0.68, "grad_norm": 0.205078125, "learning_rate": 0.0001670199077299203, "loss": 2.1636, "step": 4010 }, { "epoch": 0.68, "grad_norm": 0.2021484375, "learning_rate": 0.00016691013136797947, "loss": 2.1305, "step": 4015 }, { "epoch": 0.68, "grad_norm": 0.2041015625, "learning_rate": 0.00016680020881855301, "loss": 2.1441, "step": 4020 }, { "epoch": 0.68, "grad_norm": 0.205078125, "learning_rate": 0.0001666901403218034, "loss": 2.1418, "step": 4025 }, { "epoch": 0.68, "grad_norm": 0.197265625, "learning_rate": 0.000166579926118212, "loss": 2.15, "step": 4030 }, { "epoch": 0.68, "grad_norm": 0.205078125, "learning_rate": 0.00016646956644857837, "loss": 2.1799, "step": 4035 }, { "epoch": 0.68, "grad_norm": 0.2119140625, "learning_rate": 0.0001663590615540201, "loss": 2.1624, "step": 4040 }, { "epoch": 0.69, "grad_norm": 0.205078125, "learning_rate": 0.00016624841167597193, "loss": 2.1176, "step": 4045 }, { "epoch": 0.69, "grad_norm": 0.203125, "learning_rate": 0.00016613761705618538, "loss": 2.1614, "step": 4050 }, { "epoch": 0.69, "grad_norm": 0.203125, "learning_rate": 0.0001660266779367283, "loss": 2.1349, "step": 4055 }, { "epoch": 0.69, "grad_norm": 0.208984375, "learning_rate": 0.00016591559455998408, "loss": 2.1229, "step": 4060 }, { "epoch": 0.69, "grad_norm": 0.203125, "learning_rate": 0.0001658043671686514, "loss": 2.1506, "step": 4065 }, { "epoch": 0.69, "grad_norm": 0.208984375, "learning_rate": 0.00016569299600574365, "loss": 2.1233, "step": 4070 }, { "epoch": 0.69, "grad_norm": 0.2021484375, "learning_rate": 0.0001655814813145882, "loss": 2.151, "step": 4075 }, { "epoch": 0.69, "grad_norm": 0.2119140625, "learning_rate": 0.00016546982333882608, "loss": 2.1246, "step": 4080 }, { "epoch": 0.69, "grad_norm": 0.2041015625, "learning_rate": 0.00016535802232241133, "loss": 2.1752, "step": 4085 }, { "epoch": 0.69, "grad_norm": 0.203125, "learning_rate": 0.0001652460785096106, "loss": 2.1706, "step": 4090 }, { "epoch": 0.69, "grad_norm": 0.2021484375, "learning_rate": 0.0001651339921450024, "loss": 2.1159, "step": 4095 }, { "epoch": 0.69, "grad_norm": 0.21484375, "learning_rate": 0.0001650217634734768, "loss": 2.1378, "step": 4100 }, { "epoch": 0.7, "grad_norm": 0.2060546875, "learning_rate": 0.0001649093927402347, "loss": 2.1695, "step": 4105 }, { "epoch": 0.7, "grad_norm": 0.208984375, "learning_rate": 0.00016479688019078748, "loss": 2.1548, "step": 4110 }, { "epoch": 0.7, "grad_norm": 0.201171875, "learning_rate": 0.00016468422607095626, "loss": 2.1457, "step": 4115 }, { "epoch": 0.7, "grad_norm": 0.2001953125, "learning_rate": 0.00016457143062687153, "loss": 2.1345, "step": 4120 }, { "epoch": 0.7, "grad_norm": 0.2021484375, "learning_rate": 0.00016445849410497257, "loss": 2.1505, "step": 4125 }, { "epoch": 0.7, "grad_norm": 0.2060546875, "learning_rate": 0.00016434541675200678, "loss": 2.1584, "step": 4130 }, { "epoch": 0.7, "grad_norm": 0.20703125, "learning_rate": 0.00016423219881502946, "loss": 2.1538, "step": 4135 }, { "epoch": 0.7, "grad_norm": 0.19921875, "learning_rate": 0.00016411884054140277, "loss": 2.1481, "step": 4140 }, { "epoch": 0.7, "grad_norm": 0.2080078125, "learning_rate": 0.00016400534217879574, "loss": 2.1452, "step": 4145 }, { "epoch": 0.7, "grad_norm": 0.208984375, "learning_rate": 0.0001638917039751834, "loss": 2.1736, "step": 4150 }, { "epoch": 0.7, "grad_norm": 0.2021484375, "learning_rate": 0.00016377792617884625, "loss": 2.1551, "step": 4155 }, { "epoch": 0.7, "grad_norm": 0.2021484375, "learning_rate": 0.0001636640090383698, "loss": 2.1443, "step": 4160 }, { "epoch": 0.71, "grad_norm": 0.2080078125, "learning_rate": 0.00016354995280264402, "loss": 2.1875, "step": 4165 }, { "epoch": 0.71, "grad_norm": 0.2021484375, "learning_rate": 0.0001634357577208628, "loss": 2.1689, "step": 4170 }, { "epoch": 0.71, "grad_norm": 0.2001953125, "learning_rate": 0.00016332142404252333, "loss": 2.1903, "step": 4175 }, { "epoch": 0.71, "grad_norm": 0.2041015625, "learning_rate": 0.00016320695201742566, "loss": 2.1529, "step": 4180 }, { "epoch": 0.71, "grad_norm": 0.201171875, "learning_rate": 0.0001630923418956721, "loss": 2.1493, "step": 4185 }, { "epoch": 0.71, "grad_norm": 0.20703125, "learning_rate": 0.00016297759392766667, "loss": 2.1718, "step": 4190 }, { "epoch": 0.71, "grad_norm": 0.20703125, "learning_rate": 0.0001628627083641145, "loss": 2.155, "step": 4195 }, { "epoch": 0.71, "grad_norm": 0.1953125, "learning_rate": 0.00016274768545602143, "loss": 2.1576, "step": 4200 }, { "epoch": 0.71, "grad_norm": 0.205078125, "learning_rate": 0.00016263252545469338, "loss": 2.1408, "step": 4205 }, { "epoch": 0.71, "grad_norm": 0.20703125, "learning_rate": 0.0001625172286117357, "loss": 2.1832, "step": 4210 }, { "epoch": 0.71, "grad_norm": 0.203125, "learning_rate": 0.00016240179517905282, "loss": 2.1728, "step": 4215 }, { "epoch": 0.71, "grad_norm": 0.2001953125, "learning_rate": 0.00016228622540884755, "loss": 2.1633, "step": 4220 }, { "epoch": 0.72, "grad_norm": 0.2060546875, "learning_rate": 0.00016217051955362056, "loss": 2.1659, "step": 4225 }, { "epoch": 0.72, "grad_norm": 0.203125, "learning_rate": 0.00016205467786616984, "loss": 2.174, "step": 4230 }, { "epoch": 0.72, "grad_norm": 0.2138671875, "learning_rate": 0.0001619387005995902, "loss": 2.2027, "step": 4235 }, { "epoch": 0.72, "grad_norm": 0.2080078125, "learning_rate": 0.00016182258800727267, "loss": 2.1338, "step": 4240 }, { "epoch": 0.72, "grad_norm": 0.2080078125, "learning_rate": 0.00016170634034290383, "loss": 2.1211, "step": 4245 }, { "epoch": 0.72, "grad_norm": 0.2021484375, "learning_rate": 0.00016158995786046552, "loss": 2.1571, "step": 4250 }, { "epoch": 0.72, "grad_norm": 0.2041015625, "learning_rate": 0.00016147344081423402, "loss": 2.1354, "step": 4255 }, { "epoch": 0.72, "grad_norm": 0.2080078125, "learning_rate": 0.0001613567894587797, "loss": 2.2071, "step": 4260 }, { "epoch": 0.72, "grad_norm": 0.20703125, "learning_rate": 0.00016124000404896632, "loss": 2.1344, "step": 4265 }, { "epoch": 0.72, "grad_norm": 0.2060546875, "learning_rate": 0.00016112308483995052, "loss": 2.1864, "step": 4270 }, { "epoch": 0.72, "grad_norm": 0.203125, "learning_rate": 0.0001610060320871813, "loss": 2.1774, "step": 4275 }, { "epoch": 0.72, "grad_norm": 0.2001953125, "learning_rate": 0.0001608888460463994, "loss": 2.1568, "step": 4280 }, { "epoch": 0.73, "grad_norm": 0.2060546875, "learning_rate": 0.0001607715269736368, "loss": 2.1271, "step": 4285 }, { "epoch": 0.73, "grad_norm": 0.203125, "learning_rate": 0.00016065407512521612, "loss": 2.1558, "step": 4290 }, { "epoch": 0.73, "grad_norm": 0.2080078125, "learning_rate": 0.0001605364907577501, "loss": 2.1547, "step": 4295 }, { "epoch": 0.73, "grad_norm": 0.2080078125, "learning_rate": 0.00016041877412814094, "loss": 2.1729, "step": 4300 }, { "epoch": 0.73, "grad_norm": 0.2021484375, "learning_rate": 0.00016030092549357988, "loss": 2.191, "step": 4305 }, { "epoch": 0.73, "grad_norm": 0.2001953125, "learning_rate": 0.00016018294511154654, "loss": 2.1488, "step": 4310 }, { "epoch": 0.73, "grad_norm": 0.203125, "learning_rate": 0.00016006483323980844, "loss": 2.1452, "step": 4315 }, { "epoch": 0.73, "grad_norm": 0.205078125, "learning_rate": 0.0001599465901364202, "loss": 2.1807, "step": 4320 }, { "epoch": 0.73, "grad_norm": 0.2041015625, "learning_rate": 0.00015982821605972346, "loss": 2.1537, "step": 4325 }, { "epoch": 0.73, "grad_norm": 0.1982421875, "learning_rate": 0.00015970971126834575, "loss": 2.1796, "step": 4330 }, { "epoch": 0.73, "grad_norm": 0.205078125, "learning_rate": 0.00015959107602120032, "loss": 2.1339, "step": 4335 }, { "epoch": 0.74, "grad_norm": 0.2021484375, "learning_rate": 0.00015947231057748535, "loss": 2.1562, "step": 4340 }, { "epoch": 0.74, "grad_norm": 0.2119140625, "learning_rate": 0.00015935341519668367, "loss": 2.1585, "step": 4345 }, { "epoch": 0.74, "grad_norm": 0.203125, "learning_rate": 0.00015923439013856174, "loss": 2.1133, "step": 4350 }, { "epoch": 0.74, "grad_norm": 0.2001953125, "learning_rate": 0.00015911523566316954, "loss": 2.176, "step": 4355 }, { "epoch": 0.74, "grad_norm": 0.2041015625, "learning_rate": 0.00015899595203083976, "loss": 2.1195, "step": 4360 }, { "epoch": 0.74, "grad_norm": 0.2021484375, "learning_rate": 0.00015887653950218722, "loss": 2.1538, "step": 4365 }, { "epoch": 0.74, "grad_norm": 0.2060546875, "learning_rate": 0.00015875699833810839, "loss": 2.1617, "step": 4370 }, { "epoch": 0.74, "grad_norm": 0.2021484375, "learning_rate": 0.00015863732879978082, "loss": 2.1945, "step": 4375 }, { "epoch": 0.74, "grad_norm": 0.2158203125, "learning_rate": 0.00015851753114866251, "loss": 2.1321, "step": 4380 }, { "epoch": 0.74, "grad_norm": 0.208984375, "learning_rate": 0.0001583976056464913, "loss": 2.1336, "step": 4385 }, { "epoch": 0.74, "grad_norm": 0.2041015625, "learning_rate": 0.00015827755255528448, "loss": 2.1547, "step": 4390 }, { "epoch": 0.74, "grad_norm": 0.205078125, "learning_rate": 0.000158157372137338, "loss": 2.1544, "step": 4395 }, { "epoch": 0.75, "grad_norm": 0.203125, "learning_rate": 0.00015803706465522614, "loss": 2.1145, "step": 4400 }, { "epoch": 0.75, "grad_norm": 0.205078125, "learning_rate": 0.00015791663037180057, "loss": 2.1527, "step": 4405 }, { "epoch": 0.75, "grad_norm": 0.208984375, "learning_rate": 0.0001577960695501902, "loss": 2.1787, "step": 4410 }, { "epoch": 0.75, "grad_norm": 0.205078125, "learning_rate": 0.00015767538245380037, "loss": 2.1749, "step": 4415 }, { "epoch": 0.75, "grad_norm": 0.2041015625, "learning_rate": 0.00015755456934631222, "loss": 2.1307, "step": 4420 }, { "epoch": 0.75, "grad_norm": 0.2021484375, "learning_rate": 0.00015743363049168223, "loss": 2.1711, "step": 4425 }, { "epoch": 0.75, "grad_norm": 0.201171875, "learning_rate": 0.00015731256615414166, "loss": 2.1446, "step": 4430 }, { "epoch": 0.75, "grad_norm": 0.2041015625, "learning_rate": 0.00015719137659819593, "loss": 2.1615, "step": 4435 }, { "epoch": 0.75, "grad_norm": 0.203125, "learning_rate": 0.00015707006208862402, "loss": 2.1711, "step": 4440 }, { "epoch": 0.75, "grad_norm": 0.20703125, "learning_rate": 0.0001569486228904779, "loss": 2.1336, "step": 4445 }, { "epoch": 0.75, "grad_norm": 0.2099609375, "learning_rate": 0.000156827059269082, "loss": 2.1397, "step": 4450 }, { "epoch": 0.75, "grad_norm": 0.2041015625, "learning_rate": 0.00015670537149003257, "loss": 2.1769, "step": 4455 }, { "epoch": 0.76, "grad_norm": 0.212890625, "learning_rate": 0.0001565835598191971, "loss": 2.14, "step": 4460 }, { "epoch": 0.76, "grad_norm": 0.208984375, "learning_rate": 0.00015646162452271378, "loss": 2.1609, "step": 4465 }, { "epoch": 0.76, "grad_norm": 0.208984375, "learning_rate": 0.00015633956586699096, "loss": 2.1562, "step": 4470 }, { "epoch": 0.76, "grad_norm": 0.2021484375, "learning_rate": 0.00015621738411870643, "loss": 2.1282, "step": 4475 }, { "epoch": 0.76, "grad_norm": 0.20703125, "learning_rate": 0.00015609507954480697, "loss": 2.1813, "step": 4480 }, { "epoch": 0.76, "grad_norm": 0.216796875, "learning_rate": 0.00015597265241250763, "loss": 2.1393, "step": 4485 }, { "epoch": 0.76, "grad_norm": 0.208984375, "learning_rate": 0.00015585010298929138, "loss": 2.1257, "step": 4490 }, { "epoch": 0.76, "grad_norm": 0.203125, "learning_rate": 0.00015572743154290824, "loss": 2.1303, "step": 4495 }, { "epoch": 0.76, "grad_norm": 0.2041015625, "learning_rate": 0.00015560463834137482, "loss": 2.1328, "step": 4500 }, { "epoch": 0.76, "grad_norm": 0.2041015625, "learning_rate": 0.0001554817236529739, "loss": 2.1419, "step": 4505 }, { "epoch": 0.76, "grad_norm": 0.2021484375, "learning_rate": 0.00015535868774625353, "loss": 2.1534, "step": 4510 }, { "epoch": 0.76, "grad_norm": 0.2041015625, "learning_rate": 0.00015523553089002667, "loss": 2.1393, "step": 4515 }, { "epoch": 0.77, "grad_norm": 0.2080078125, "learning_rate": 0.0001551122533533705, "loss": 2.145, "step": 4520 }, { "epoch": 0.77, "grad_norm": 0.2109375, "learning_rate": 0.00015498885540562597, "loss": 2.1604, "step": 4525 }, { "epoch": 0.77, "grad_norm": 0.1982421875, "learning_rate": 0.000154865337316397, "loss": 2.137, "step": 4530 }, { "epoch": 0.77, "grad_norm": 0.2099609375, "learning_rate": 0.00015474169935554994, "loss": 2.1242, "step": 4535 }, { "epoch": 0.77, "grad_norm": 0.208984375, "learning_rate": 0.00015461794179321323, "loss": 2.2208, "step": 4540 }, { "epoch": 0.77, "grad_norm": 0.203125, "learning_rate": 0.0001544940648997765, "loss": 2.1613, "step": 4545 }, { "epoch": 0.77, "grad_norm": 0.20703125, "learning_rate": 0.00015437006894589007, "loss": 2.1307, "step": 4550 }, { "epoch": 0.77, "grad_norm": 0.205078125, "learning_rate": 0.00015424595420246448, "loss": 2.1235, "step": 4555 }, { "epoch": 0.77, "grad_norm": 0.203125, "learning_rate": 0.00015412172094066975, "loss": 2.1515, "step": 4560 }, { "epoch": 0.77, "grad_norm": 0.2041015625, "learning_rate": 0.00015399736943193487, "loss": 2.1534, "step": 4565 }, { "epoch": 0.77, "grad_norm": 0.203125, "learning_rate": 0.0001538728999479471, "loss": 2.1222, "step": 4570 }, { "epoch": 0.77, "grad_norm": 0.201171875, "learning_rate": 0.00015374831276065157, "loss": 2.2067, "step": 4575 }, { "epoch": 0.78, "grad_norm": 0.2021484375, "learning_rate": 0.0001536236081422505, "loss": 2.1823, "step": 4580 }, { "epoch": 0.78, "grad_norm": 0.203125, "learning_rate": 0.00015349878636520273, "loss": 2.1067, "step": 4585 }, { "epoch": 0.78, "grad_norm": 0.20703125, "learning_rate": 0.00015337384770222295, "loss": 2.1536, "step": 4590 }, { "epoch": 0.78, "grad_norm": 0.2080078125, "learning_rate": 0.00015324879242628145, "loss": 2.149, "step": 4595 }, { "epoch": 0.78, "grad_norm": 0.208984375, "learning_rate": 0.00015312362081060308, "loss": 2.1436, "step": 4600 }, { "epoch": 0.78, "grad_norm": 0.2080078125, "learning_rate": 0.00015299833312866696, "loss": 2.1826, "step": 4605 }, { "epoch": 0.78, "grad_norm": 0.19921875, "learning_rate": 0.0001528729296542058, "loss": 2.1255, "step": 4610 }, { "epoch": 0.78, "grad_norm": 0.2021484375, "learning_rate": 0.00015274741066120535, "loss": 2.1766, "step": 4615 }, { "epoch": 0.78, "grad_norm": 0.2197265625, "learning_rate": 0.0001526217764239036, "loss": 2.1777, "step": 4620 }, { "epoch": 0.78, "grad_norm": 0.201171875, "learning_rate": 0.00015249602721679047, "loss": 2.1478, "step": 4625 }, { "epoch": 0.78, "grad_norm": 0.2109375, "learning_rate": 0.00015237016331460702, "loss": 2.1484, "step": 4630 }, { "epoch": 0.78, "grad_norm": 0.2001953125, "learning_rate": 0.00015224418499234488, "loss": 2.1994, "step": 4635 }, { "epoch": 0.79, "grad_norm": 0.2041015625, "learning_rate": 0.00015211809252524568, "loss": 2.1161, "step": 4640 }, { "epoch": 0.79, "grad_norm": 0.232421875, "learning_rate": 0.00015199188618880049, "loss": 2.1493, "step": 4645 }, { "epoch": 0.79, "grad_norm": 0.208984375, "learning_rate": 0.0001518655662587491, "loss": 2.1431, "step": 4650 }, { "epoch": 0.79, "grad_norm": 0.203125, "learning_rate": 0.0001517391330110795, "loss": 2.1434, "step": 4655 }, { "epoch": 0.79, "grad_norm": 0.2080078125, "learning_rate": 0.00015161258672202724, "loss": 2.1252, "step": 4660 }, { "epoch": 0.79, "grad_norm": 0.203125, "learning_rate": 0.0001514859276680749, "loss": 2.1591, "step": 4665 }, { "epoch": 0.79, "grad_norm": 0.1982421875, "learning_rate": 0.0001513591561259514, "loss": 2.1137, "step": 4670 }, { "epoch": 0.79, "grad_norm": 0.2060546875, "learning_rate": 0.00015123227237263148, "loss": 2.1636, "step": 4675 }, { "epoch": 0.79, "grad_norm": 0.2080078125, "learning_rate": 0.00015110527668533486, "loss": 2.1489, "step": 4680 }, { "epoch": 0.79, "grad_norm": 0.2060546875, "learning_rate": 0.0001509781693415261, "loss": 2.1301, "step": 4685 }, { "epoch": 0.79, "grad_norm": 0.203125, "learning_rate": 0.00015085095061891348, "loss": 2.1761, "step": 4690 }, { "epoch": 0.8, "grad_norm": 0.203125, "learning_rate": 0.0001507236207954487, "loss": 2.2051, "step": 4695 }, { "epoch": 0.8, "grad_norm": 0.203125, "learning_rate": 0.00015059618014932625, "loss": 2.1436, "step": 4700 }, { "epoch": 0.8, "grad_norm": 0.2119140625, "learning_rate": 0.00015046862895898267, "loss": 2.1323, "step": 4705 }, { "epoch": 0.8, "grad_norm": 0.2060546875, "learning_rate": 0.00015034096750309609, "loss": 2.1837, "step": 4710 }, { "epoch": 0.8, "grad_norm": 0.205078125, "learning_rate": 0.00015021319606058544, "loss": 2.1374, "step": 4715 }, { "epoch": 0.8, "grad_norm": 0.208984375, "learning_rate": 0.00015008531491061012, "loss": 2.1646, "step": 4720 }, { "epoch": 0.8, "grad_norm": 0.21484375, "learning_rate": 0.00014995732433256906, "loss": 2.1726, "step": 4725 }, { "epoch": 0.8, "grad_norm": 0.208984375, "learning_rate": 0.00014982922460610038, "loss": 2.1688, "step": 4730 }, { "epoch": 0.8, "grad_norm": 0.205078125, "learning_rate": 0.00014970101601108059, "loss": 2.1733, "step": 4735 }, { "epoch": 0.8, "grad_norm": 0.2109375, "learning_rate": 0.00014957269882762416, "loss": 2.1179, "step": 4740 }, { "epoch": 0.8, "grad_norm": 0.21484375, "learning_rate": 0.0001494442733360827, "loss": 2.1872, "step": 4745 }, { "epoch": 0.8, "grad_norm": 0.2109375, "learning_rate": 0.00014931573981704453, "loss": 2.1705, "step": 4750 }, { "epoch": 0.81, "grad_norm": 0.2119140625, "learning_rate": 0.00014918709855133396, "loss": 2.1283, "step": 4755 }, { "epoch": 0.81, "grad_norm": 0.208984375, "learning_rate": 0.00014905834982001075, "loss": 2.1372, "step": 4760 }, { "epoch": 0.81, "grad_norm": 0.212890625, "learning_rate": 0.00014892949390436934, "loss": 2.161, "step": 4765 }, { "epoch": 0.81, "grad_norm": 0.2060546875, "learning_rate": 0.0001488005310859385, "loss": 2.1579, "step": 4770 }, { "epoch": 0.81, "grad_norm": 0.212890625, "learning_rate": 0.0001486714616464805, "loss": 2.1786, "step": 4775 }, { "epoch": 0.81, "grad_norm": 0.20703125, "learning_rate": 0.0001485422858679905, "loss": 2.1798, "step": 4780 }, { "epoch": 0.81, "grad_norm": 0.205078125, "learning_rate": 0.0001484130040326961, "loss": 2.1244, "step": 4785 }, { "epoch": 0.81, "grad_norm": 0.2080078125, "learning_rate": 0.0001482836164230565, "loss": 2.1467, "step": 4790 }, { "epoch": 0.81, "grad_norm": 0.205078125, "learning_rate": 0.00014815412332176212, "loss": 2.1469, "step": 4795 }, { "epoch": 0.81, "grad_norm": 0.212890625, "learning_rate": 0.00014802452501173384, "loss": 2.1511, "step": 4800 }, { "epoch": 0.81, "grad_norm": 0.2060546875, "learning_rate": 0.00014789482177612225, "loss": 2.1176, "step": 4805 }, { "epoch": 0.81, "grad_norm": 0.21484375, "learning_rate": 0.00014776501389830737, "loss": 2.1606, "step": 4810 }, { "epoch": 0.82, "grad_norm": 0.212890625, "learning_rate": 0.00014763510166189783, "loss": 2.1423, "step": 4815 }, { "epoch": 0.82, "grad_norm": 0.216796875, "learning_rate": 0.00014750508535073012, "loss": 2.166, "step": 4820 }, { "epoch": 0.82, "grad_norm": 0.2080078125, "learning_rate": 0.00014737496524886828, "loss": 2.1404, "step": 4825 }, { "epoch": 0.82, "grad_norm": 0.2021484375, "learning_rate": 0.00014724474164060298, "loss": 2.1461, "step": 4830 }, { "epoch": 0.82, "grad_norm": 0.2080078125, "learning_rate": 0.00014711441481045115, "loss": 2.1584, "step": 4835 }, { "epoch": 0.82, "grad_norm": 0.2099609375, "learning_rate": 0.00014698398504315522, "loss": 2.1381, "step": 4840 }, { "epoch": 0.82, "grad_norm": 0.2109375, "learning_rate": 0.00014685345262368242, "loss": 2.1385, "step": 4845 }, { "epoch": 0.82, "grad_norm": 0.203125, "learning_rate": 0.00014672281783722438, "loss": 2.187, "step": 4850 }, { "epoch": 0.82, "grad_norm": 0.20703125, "learning_rate": 0.00014659208096919635, "loss": 2.2096, "step": 4855 }, { "epoch": 0.82, "grad_norm": 0.2060546875, "learning_rate": 0.00014646124230523652, "loss": 2.1409, "step": 4860 }, { "epoch": 0.82, "grad_norm": 0.2080078125, "learning_rate": 0.00014633030213120568, "loss": 2.1633, "step": 4865 }, { "epoch": 0.82, "grad_norm": 0.2060546875, "learning_rate": 0.00014619926073318617, "loss": 2.1757, "step": 4870 }, { "epoch": 0.83, "grad_norm": 0.2021484375, "learning_rate": 0.00014606811839748172, "loss": 2.1703, "step": 4875 }, { "epoch": 0.83, "grad_norm": 0.2158203125, "learning_rate": 0.00014593687541061636, "loss": 2.1715, "step": 4880 }, { "epoch": 0.83, "grad_norm": 0.2109375, "learning_rate": 0.00014580553205933422, "loss": 2.1174, "step": 4885 }, { "epoch": 0.83, "grad_norm": 0.2119140625, "learning_rate": 0.00014567408863059864, "loss": 2.1588, "step": 4890 }, { "epoch": 0.83, "grad_norm": 0.20703125, "learning_rate": 0.00014554254541159154, "loss": 2.1533, "step": 4895 }, { "epoch": 0.83, "grad_norm": 0.2109375, "learning_rate": 0.00014541090268971297, "loss": 2.1168, "step": 4900 }, { "epoch": 0.83, "grad_norm": 0.203125, "learning_rate": 0.00014527916075258036, "loss": 2.1413, "step": 4905 }, { "epoch": 0.83, "grad_norm": 0.205078125, "learning_rate": 0.00014514731988802786, "loss": 2.1658, "step": 4910 }, { "epoch": 0.83, "grad_norm": 0.2060546875, "learning_rate": 0.00014501538038410574, "loss": 2.1561, "step": 4915 }, { "epoch": 0.83, "grad_norm": 0.2041015625, "learning_rate": 0.00014488334252907992, "loss": 2.1379, "step": 4920 }, { "epoch": 0.83, "grad_norm": 0.2060546875, "learning_rate": 0.00014475120661143107, "loss": 2.131, "step": 4925 }, { "epoch": 0.83, "grad_norm": 0.19921875, "learning_rate": 0.00014461897291985411, "loss": 2.1684, "step": 4930 }, { "epoch": 0.84, "grad_norm": 0.2109375, "learning_rate": 0.00014448664174325764, "loss": 2.1809, "step": 4935 }, { "epoch": 0.84, "grad_norm": 0.21875, "learning_rate": 0.00014435421337076327, "loss": 2.1414, "step": 4940 }, { "epoch": 0.84, "grad_norm": 0.205078125, "learning_rate": 0.00014422168809170486, "loss": 2.1749, "step": 4945 }, { "epoch": 0.84, "grad_norm": 0.2158203125, "learning_rate": 0.00014408906619562808, "loss": 2.1698, "step": 4950 }, { "epoch": 0.84, "grad_norm": 0.2060546875, "learning_rate": 0.0001439563479722897, "loss": 2.1361, "step": 4955 }, { "epoch": 0.84, "grad_norm": 0.2099609375, "learning_rate": 0.00014382353371165685, "loss": 2.1304, "step": 4960 }, { "epoch": 0.84, "grad_norm": 0.2109375, "learning_rate": 0.00014369062370390667, "loss": 2.1559, "step": 4965 }, { "epoch": 0.84, "grad_norm": 0.208984375, "learning_rate": 0.00014355761823942525, "loss": 2.1343, "step": 4970 }, { "epoch": 0.84, "grad_norm": 0.208984375, "learning_rate": 0.0001434245176088074, "loss": 2.1623, "step": 4975 }, { "epoch": 0.84, "grad_norm": 0.20703125, "learning_rate": 0.00014329132210285587, "loss": 2.1498, "step": 4980 }, { "epoch": 0.84, "grad_norm": 0.2138671875, "learning_rate": 0.00014315803201258058, "loss": 2.1251, "step": 4985 }, { "epoch": 0.85, "grad_norm": 0.2001953125, "learning_rate": 0.00014302464762919819, "loss": 2.1, "step": 4990 }, { "epoch": 0.85, "grad_norm": 0.2138671875, "learning_rate": 0.00014289116924413132, "loss": 2.1501, "step": 4995 }, { "epoch": 0.85, "grad_norm": 0.2001953125, "learning_rate": 0.00014275759714900806, "loss": 2.135, "step": 5000 }, { "epoch": 0.85, "grad_norm": 0.2138671875, "learning_rate": 0.0001426239316356611, "loss": 2.1379, "step": 5005 }, { "epoch": 0.85, "grad_norm": 0.208984375, "learning_rate": 0.00014249017299612735, "loss": 2.1039, "step": 5010 }, { "epoch": 0.85, "grad_norm": 0.2060546875, "learning_rate": 0.00014235632152264716, "loss": 2.1887, "step": 5015 }, { "epoch": 0.85, "grad_norm": 0.208984375, "learning_rate": 0.00014222237750766365, "loss": 2.1571, "step": 5020 }, { "epoch": 0.85, "grad_norm": 0.216796875, "learning_rate": 0.0001420883412438222, "loss": 2.1553, "step": 5025 }, { "epoch": 0.85, "grad_norm": 0.20703125, "learning_rate": 0.00014195421302396968, "loss": 2.1225, "step": 5030 }, { "epoch": 0.85, "grad_norm": 0.2109375, "learning_rate": 0.00014181999314115393, "loss": 2.151, "step": 5035 }, { "epoch": 0.85, "grad_norm": 0.205078125, "learning_rate": 0.000141685681888623, "loss": 2.1276, "step": 5040 }, { "epoch": 0.85, "grad_norm": 0.20703125, "learning_rate": 0.0001415512795598246, "loss": 2.162, "step": 5045 }, { "epoch": 0.86, "grad_norm": 0.205078125, "learning_rate": 0.00014141678644840542, "loss": 2.1619, "step": 5050 }, { "epoch": 0.86, "grad_norm": 0.212890625, "learning_rate": 0.0001412822028482105, "loss": 2.1493, "step": 5055 }, { "epoch": 0.86, "grad_norm": 0.1982421875, "learning_rate": 0.00014114752905328257, "loss": 2.1496, "step": 5060 }, { "epoch": 0.86, "grad_norm": 0.2099609375, "learning_rate": 0.00014101276535786138, "loss": 2.1648, "step": 5065 }, { "epoch": 0.86, "grad_norm": 0.208984375, "learning_rate": 0.00014087791205638324, "loss": 2.1168, "step": 5070 }, { "epoch": 0.86, "grad_norm": 0.2109375, "learning_rate": 0.00014074296944348007, "loss": 2.1447, "step": 5075 }, { "epoch": 0.86, "grad_norm": 0.216796875, "learning_rate": 0.000140607937813979, "loss": 2.1338, "step": 5080 }, { "epoch": 0.86, "grad_norm": 0.212890625, "learning_rate": 0.00014047281746290167, "loss": 2.1485, "step": 5085 }, { "epoch": 0.86, "grad_norm": 0.2060546875, "learning_rate": 0.00014033760868546345, "loss": 2.1682, "step": 5090 }, { "epoch": 0.86, "grad_norm": 0.2109375, "learning_rate": 0.00014020231177707307, "loss": 2.1584, "step": 5095 }, { "epoch": 0.86, "grad_norm": 0.2109375, "learning_rate": 0.00014006692703333171, "loss": 2.1144, "step": 5100 }, { "epoch": 0.86, "grad_norm": 0.201171875, "learning_rate": 0.00013993145475003243, "loss": 2.1796, "step": 5105 }, { "epoch": 0.87, "grad_norm": 0.205078125, "learning_rate": 0.00013979589522315959, "loss": 2.1514, "step": 5110 }, { "epoch": 0.87, "grad_norm": 0.203125, "learning_rate": 0.00013966024874888821, "loss": 2.101, "step": 5115 }, { "epoch": 0.87, "grad_norm": 0.2109375, "learning_rate": 0.0001395245156235832, "loss": 2.1363, "step": 5120 }, { "epoch": 0.87, "grad_norm": 0.20703125, "learning_rate": 0.00013938869614379883, "loss": 2.145, "step": 5125 }, { "epoch": 0.87, "grad_norm": 0.2041015625, "learning_rate": 0.000139252790606278, "loss": 2.1134, "step": 5130 }, { "epoch": 0.87, "grad_norm": 0.2060546875, "learning_rate": 0.0001391167993079517, "loss": 2.1702, "step": 5135 }, { "epoch": 0.87, "grad_norm": 0.205078125, "learning_rate": 0.00013898072254593823, "loss": 2.1752, "step": 5140 }, { "epoch": 0.87, "grad_norm": 0.2060546875, "learning_rate": 0.00013884456061754265, "loss": 2.1614, "step": 5145 }, { "epoch": 0.87, "grad_norm": 0.201171875, "learning_rate": 0.00013870831382025602, "loss": 2.1189, "step": 5150 }, { "epoch": 0.87, "grad_norm": 0.205078125, "learning_rate": 0.00013857198245175497, "loss": 2.1356, "step": 5155 }, { "epoch": 0.87, "grad_norm": 0.20703125, "learning_rate": 0.00013843556680990078, "loss": 2.1685, "step": 5160 }, { "epoch": 0.87, "grad_norm": 0.205078125, "learning_rate": 0.00013829906719273885, "loss": 2.1494, "step": 5165 }, { "epoch": 0.88, "grad_norm": 0.2109375, "learning_rate": 0.0001381624838984982, "loss": 2.1311, "step": 5170 }, { "epoch": 0.88, "grad_norm": 0.2158203125, "learning_rate": 0.00013802581722559048, "loss": 2.1802, "step": 5175 }, { "epoch": 0.88, "grad_norm": 0.2109375, "learning_rate": 0.0001378890674726096, "loss": 2.1341, "step": 5180 }, { "epoch": 0.88, "grad_norm": 0.2109375, "learning_rate": 0.000137752234938331, "loss": 2.1643, "step": 5185 }, { "epoch": 0.88, "grad_norm": 0.20703125, "learning_rate": 0.00013761531992171095, "loss": 2.1469, "step": 5190 }, { "epoch": 0.88, "grad_norm": 0.2099609375, "learning_rate": 0.00013747832272188596, "loss": 2.1857, "step": 5195 }, { "epoch": 0.88, "grad_norm": 0.2138671875, "learning_rate": 0.00013734124363817208, "loss": 2.1803, "step": 5200 }, { "epoch": 0.88, "grad_norm": 0.2099609375, "learning_rate": 0.0001372040829700642, "loss": 2.1367, "step": 5205 }, { "epoch": 0.88, "grad_norm": 0.2021484375, "learning_rate": 0.00013706684101723562, "loss": 2.1175, "step": 5210 }, { "epoch": 0.88, "grad_norm": 0.2060546875, "learning_rate": 0.00013692951807953708, "loss": 2.1463, "step": 5215 }, { "epoch": 0.88, "grad_norm": 0.2138671875, "learning_rate": 0.00013679211445699632, "loss": 2.1948, "step": 5220 }, { "epoch": 0.88, "grad_norm": 0.20703125, "learning_rate": 0.0001366546304498173, "loss": 2.1677, "step": 5225 }, { "epoch": 0.89, "grad_norm": 0.208984375, "learning_rate": 0.00013651706635837976, "loss": 2.1749, "step": 5230 }, { "epoch": 0.89, "grad_norm": 0.2041015625, "learning_rate": 0.00013637942248323828, "loss": 2.1, "step": 5235 }, { "epoch": 0.89, "grad_norm": 0.205078125, "learning_rate": 0.00013624169912512173, "loss": 2.1391, "step": 5240 }, { "epoch": 0.89, "grad_norm": 0.20703125, "learning_rate": 0.00013610389658493276, "loss": 2.1248, "step": 5245 }, { "epoch": 0.89, "grad_norm": 0.2099609375, "learning_rate": 0.00013596601516374697, "loss": 2.1287, "step": 5250 }, { "epoch": 0.89, "grad_norm": 0.2080078125, "learning_rate": 0.00013582805516281217, "loss": 2.1049, "step": 5255 }, { "epoch": 0.89, "grad_norm": 0.21484375, "learning_rate": 0.0001356900168835481, "loss": 2.1764, "step": 5260 }, { "epoch": 0.89, "grad_norm": 0.2392578125, "learning_rate": 0.00013555190062754534, "loss": 2.163, "step": 5265 }, { "epoch": 0.89, "grad_norm": 0.2099609375, "learning_rate": 0.00013541370669656487, "loss": 2.1276, "step": 5270 }, { "epoch": 0.89, "grad_norm": 0.2109375, "learning_rate": 0.00013527543539253742, "loss": 2.1712, "step": 5275 }, { "epoch": 0.89, "grad_norm": 0.208984375, "learning_rate": 0.00013513708701756277, "loss": 2.1763, "step": 5280 }, { "epoch": 0.9, "grad_norm": 0.2021484375, "learning_rate": 0.000134998661873909, "loss": 2.1273, "step": 5285 }, { "epoch": 0.9, "grad_norm": 0.2099609375, "learning_rate": 0.00013486016026401202, "loss": 2.1176, "step": 5290 }, { "epoch": 0.9, "grad_norm": 0.203125, "learning_rate": 0.00013472158249047477, "loss": 2.1221, "step": 5295 }, { "epoch": 0.9, "grad_norm": 0.2138671875, "learning_rate": 0.00013458292885606656, "loss": 2.1508, "step": 5300 }, { "epoch": 0.9, "grad_norm": 0.20703125, "learning_rate": 0.00013444419966372252, "loss": 2.1472, "step": 5305 }, { "epoch": 0.9, "grad_norm": 0.21875, "learning_rate": 0.00013430539521654277, "loss": 2.2225, "step": 5310 }, { "epoch": 0.9, "grad_norm": 0.2021484375, "learning_rate": 0.0001341665158177919, "loss": 2.1097, "step": 5315 }, { "epoch": 0.9, "grad_norm": 0.205078125, "learning_rate": 0.00013402756177089827, "loss": 2.1191, "step": 5320 }, { "epoch": 0.9, "grad_norm": 0.2119140625, "learning_rate": 0.0001338885333794533, "loss": 2.1713, "step": 5325 }, { "epoch": 0.9, "grad_norm": 0.205078125, "learning_rate": 0.00013374943094721084, "loss": 2.1795, "step": 5330 }, { "epoch": 0.9, "grad_norm": 0.2138671875, "learning_rate": 0.00013361025477808656, "loss": 2.1675, "step": 5335 }, { "epoch": 0.9, "grad_norm": 0.2099609375, "learning_rate": 0.00013347100517615716, "loss": 2.1828, "step": 5340 }, { "epoch": 0.91, "grad_norm": 0.2099609375, "learning_rate": 0.0001333316824456598, "loss": 2.1384, "step": 5345 }, { "epoch": 0.91, "grad_norm": 0.2119140625, "learning_rate": 0.00013319228689099154, "loss": 2.1835, "step": 5350 }, { "epoch": 0.91, "grad_norm": 0.205078125, "learning_rate": 0.00013305281881670827, "loss": 2.1461, "step": 5355 }, { "epoch": 0.91, "grad_norm": 0.208984375, "learning_rate": 0.00013291327852752458, "loss": 2.1473, "step": 5360 }, { "epoch": 0.91, "grad_norm": 0.22265625, "learning_rate": 0.00013277366632831271, "loss": 2.1584, "step": 5365 }, { "epoch": 0.91, "grad_norm": 0.21484375, "learning_rate": 0.0001326339825241021, "loss": 2.1336, "step": 5370 }, { "epoch": 0.91, "grad_norm": 0.205078125, "learning_rate": 0.00013249422742007852, "loss": 2.1211, "step": 5375 }, { "epoch": 0.91, "grad_norm": 0.2060546875, "learning_rate": 0.00013235440132158366, "loss": 2.1066, "step": 5380 }, { "epoch": 0.91, "grad_norm": 0.212890625, "learning_rate": 0.00013221450453411413, "loss": 2.1636, "step": 5385 }, { "epoch": 0.91, "grad_norm": 0.20703125, "learning_rate": 0.00013207453736332117, "loss": 2.1426, "step": 5390 }, { "epoch": 0.91, "grad_norm": 0.2138671875, "learning_rate": 0.0001319345001150097, "loss": 2.178, "step": 5395 }, { "epoch": 0.91, "grad_norm": 0.205078125, "learning_rate": 0.0001317943930951378, "loss": 2.1224, "step": 5400 }, { "epoch": 0.92, "grad_norm": 0.2216796875, "learning_rate": 0.00013165421660981592, "loss": 2.1353, "step": 5405 }, { "epoch": 0.92, "grad_norm": 0.2119140625, "learning_rate": 0.00013151397096530635, "loss": 2.1219, "step": 5410 }, { "epoch": 0.92, "grad_norm": 0.2158203125, "learning_rate": 0.0001313736564680224, "loss": 2.1358, "step": 5415 }, { "epoch": 0.92, "grad_norm": 0.208984375, "learning_rate": 0.0001312332734245279, "loss": 2.1331, "step": 5420 }, { "epoch": 0.92, "grad_norm": 0.2060546875, "learning_rate": 0.0001310928221415364, "loss": 2.126, "step": 5425 }, { "epoch": 0.92, "grad_norm": 0.2119140625, "learning_rate": 0.00013095230292591055, "loss": 2.1418, "step": 5430 }, { "epoch": 0.92, "grad_norm": 0.2099609375, "learning_rate": 0.0001308117160846614, "loss": 2.1893, "step": 5435 }, { "epoch": 0.92, "grad_norm": 0.208984375, "learning_rate": 0.0001306710619249478, "loss": 2.1608, "step": 5440 }, { "epoch": 0.92, "grad_norm": 0.2041015625, "learning_rate": 0.00013053034075407555, "loss": 2.1653, "step": 5445 }, { "epoch": 0.92, "grad_norm": 0.2109375, "learning_rate": 0.00013038955287949708, "loss": 2.141, "step": 5450 }, { "epoch": 0.92, "grad_norm": 0.212890625, "learning_rate": 0.00013024869860881036, "loss": 2.145, "step": 5455 }, { "epoch": 0.92, "grad_norm": 0.2099609375, "learning_rate": 0.00013010777824975852, "loss": 2.1504, "step": 5460 }, { "epoch": 0.93, "grad_norm": 0.2119140625, "learning_rate": 0.00012996679211022908, "loss": 2.1545, "step": 5465 }, { "epoch": 0.93, "grad_norm": 0.2109375, "learning_rate": 0.00012982574049825324, "loss": 2.118, "step": 5470 }, { "epoch": 0.93, "grad_norm": 0.203125, "learning_rate": 0.00012968462372200517, "loss": 2.1523, "step": 5475 }, { "epoch": 0.93, "grad_norm": 0.2109375, "learning_rate": 0.00012954344208980167, "loss": 2.1289, "step": 5480 }, { "epoch": 0.93, "grad_norm": 0.2138671875, "learning_rate": 0.000129402195910101, "loss": 2.15, "step": 5485 }, { "epoch": 0.93, "grad_norm": 0.2158203125, "learning_rate": 0.00012926088549150246, "loss": 2.1693, "step": 5490 }, { "epoch": 0.93, "grad_norm": 0.2041015625, "learning_rate": 0.00012911951114274588, "loss": 2.1559, "step": 5495 }, { "epoch": 0.93, "grad_norm": 0.2158203125, "learning_rate": 0.0001289780731727106, "loss": 2.1352, "step": 5500 }, { "epoch": 0.93, "grad_norm": 0.2109375, "learning_rate": 0.00012883657189041495, "loss": 2.1314, "step": 5505 }, { "epoch": 0.93, "grad_norm": 0.208984375, "learning_rate": 0.00012869500760501572, "loss": 2.1777, "step": 5510 }, { "epoch": 0.93, "grad_norm": 0.2109375, "learning_rate": 0.00012855338062580732, "loss": 2.1191, "step": 5515 }, { "epoch": 0.93, "grad_norm": 0.2099609375, "learning_rate": 0.000128411691262221, "loss": 2.1499, "step": 5520 }, { "epoch": 0.94, "grad_norm": 0.212890625, "learning_rate": 0.00012826993982382448, "loss": 2.1447, "step": 5525 }, { "epoch": 0.94, "grad_norm": 0.2099609375, "learning_rate": 0.00012812812662032098, "loss": 2.1268, "step": 5530 }, { "epoch": 0.94, "grad_norm": 0.2119140625, "learning_rate": 0.0001279862519615487, "loss": 2.15, "step": 5535 }, { "epoch": 0.94, "grad_norm": 0.205078125, "learning_rate": 0.0001278443161574802, "loss": 2.1364, "step": 5540 }, { "epoch": 0.94, "grad_norm": 0.203125, "learning_rate": 0.00012770231951822144, "loss": 2.1466, "step": 5545 }, { "epoch": 0.94, "grad_norm": 0.208984375, "learning_rate": 0.00012756026235401154, "loss": 2.1302, "step": 5550 }, { "epoch": 0.94, "grad_norm": 0.2099609375, "learning_rate": 0.00012741814497522165, "loss": 2.1373, "step": 5555 }, { "epoch": 0.94, "grad_norm": 0.212890625, "learning_rate": 0.0001272759676923546, "loss": 2.1432, "step": 5560 }, { "epoch": 0.94, "grad_norm": 0.2060546875, "learning_rate": 0.00012713373081604397, "loss": 2.128, "step": 5565 }, { "epoch": 0.94, "grad_norm": 0.208984375, "learning_rate": 0.00012699143465705378, "loss": 2.1319, "step": 5570 }, { "epoch": 0.94, "grad_norm": 0.2041015625, "learning_rate": 0.0001268490795262773, "loss": 2.1415, "step": 5575 }, { "epoch": 0.95, "grad_norm": 0.2109375, "learning_rate": 0.00012670666573473676, "loss": 2.21, "step": 5580 }, { "epoch": 0.95, "grad_norm": 0.20703125, "learning_rate": 0.00012656419359358261, "loss": 2.1752, "step": 5585 }, { "epoch": 0.95, "grad_norm": 0.2080078125, "learning_rate": 0.00012642166341409277, "loss": 2.1218, "step": 5590 }, { "epoch": 0.95, "grad_norm": 0.2041015625, "learning_rate": 0.00012627907550767187, "loss": 2.1361, "step": 5595 }, { "epoch": 0.95, "grad_norm": 0.20703125, "learning_rate": 0.0001261364301858507, "loss": 2.1305, "step": 5600 }, { "epoch": 0.95, "grad_norm": 0.2041015625, "learning_rate": 0.00012599372776028557, "loss": 2.1319, "step": 5605 }, { "epoch": 0.95, "grad_norm": 0.208984375, "learning_rate": 0.0001258509685427575, "loss": 2.1559, "step": 5610 }, { "epoch": 0.95, "grad_norm": 0.212890625, "learning_rate": 0.00012570815284517153, "loss": 2.1181, "step": 5615 }, { "epoch": 0.95, "grad_norm": 0.21875, "learning_rate": 0.00012556528097955617, "loss": 2.1424, "step": 5620 }, { "epoch": 0.95, "grad_norm": 0.2216796875, "learning_rate": 0.00012542235325806267, "loss": 2.1025, "step": 5625 }, { "epoch": 0.95, "grad_norm": 0.2099609375, "learning_rate": 0.00012527936999296428, "loss": 2.2013, "step": 5630 }, { "epoch": 0.95, "grad_norm": 0.205078125, "learning_rate": 0.00012513633149665557, "loss": 2.1427, "step": 5635 }, { "epoch": 0.96, "grad_norm": 0.2119140625, "learning_rate": 0.00012499323808165183, "loss": 2.1794, "step": 5640 }, { "epoch": 0.96, "grad_norm": 0.205078125, "learning_rate": 0.00012485009006058835, "loss": 2.1722, "step": 5645 }, { "epoch": 0.96, "grad_norm": 0.2177734375, "learning_rate": 0.00012470688774621964, "loss": 2.1241, "step": 5650 }, { "epoch": 0.96, "grad_norm": 0.21484375, "learning_rate": 0.00012456363145141894, "loss": 2.1439, "step": 5655 }, { "epoch": 0.96, "grad_norm": 0.2060546875, "learning_rate": 0.00012442032148917738, "loss": 2.1363, "step": 5660 }, { "epoch": 0.96, "grad_norm": 0.212890625, "learning_rate": 0.00012427695817260329, "loss": 2.1426, "step": 5665 }, { "epoch": 0.96, "grad_norm": 0.2119140625, "learning_rate": 0.0001241335418149217, "loss": 2.1132, "step": 5670 }, { "epoch": 0.96, "grad_norm": 0.208984375, "learning_rate": 0.00012399007272947341, "loss": 2.1441, "step": 5675 }, { "epoch": 0.96, "grad_norm": 0.2158203125, "learning_rate": 0.00012384655122971445, "loss": 2.1381, "step": 5680 }, { "epoch": 0.96, "grad_norm": 0.2119140625, "learning_rate": 0.00012370297762921538, "loss": 2.1614, "step": 5685 }, { "epoch": 0.96, "grad_norm": 0.2060546875, "learning_rate": 0.0001235593522416606, "loss": 2.1412, "step": 5690 }, { "epoch": 0.96, "grad_norm": 0.2138671875, "learning_rate": 0.00012341567538084764, "loss": 2.1509, "step": 5695 }, { "epoch": 0.97, "grad_norm": 0.22265625, "learning_rate": 0.00012327194736068653, "loss": 2.1336, "step": 5700 }, { "epoch": 0.97, "grad_norm": 0.212890625, "learning_rate": 0.00012312816849519899, "loss": 2.1298, "step": 5705 }, { "epoch": 0.97, "grad_norm": 0.216796875, "learning_rate": 0.00012298433909851785, "loss": 2.189, "step": 5710 }, { "epoch": 0.97, "grad_norm": 0.2216796875, "learning_rate": 0.00012284045948488648, "loss": 2.1234, "step": 5715 }, { "epoch": 0.97, "grad_norm": 0.2041015625, "learning_rate": 0.00012269652996865776, "loss": 2.1426, "step": 5720 }, { "epoch": 0.97, "grad_norm": 0.2177734375, "learning_rate": 0.00012255255086429372, "loss": 2.2038, "step": 5725 }, { "epoch": 0.97, "grad_norm": 0.2177734375, "learning_rate": 0.00012240852248636473, "loss": 2.1829, "step": 5730 }, { "epoch": 0.97, "grad_norm": 0.2099609375, "learning_rate": 0.00012226444514954878, "loss": 2.1347, "step": 5735 }, { "epoch": 0.97, "grad_norm": 0.2041015625, "learning_rate": 0.00012212031916863082, "loss": 2.1792, "step": 5740 }, { "epoch": 0.97, "grad_norm": 0.2080078125, "learning_rate": 0.0001219761448585021, "loss": 2.1241, "step": 5745 }, { "epoch": 0.97, "grad_norm": 0.21484375, "learning_rate": 0.00012183192253415952, "loss": 2.1887, "step": 5750 }, { "epoch": 0.97, "grad_norm": 0.208984375, "learning_rate": 0.00012168765251070473, "loss": 2.1419, "step": 5755 }, { "epoch": 0.98, "grad_norm": 0.2119140625, "learning_rate": 0.00012154333510334375, "loss": 2.1507, "step": 5760 }, { "epoch": 0.98, "grad_norm": 0.2138671875, "learning_rate": 0.00012139897062738606, "loss": 2.1603, "step": 5765 }, { "epoch": 0.98, "grad_norm": 0.2119140625, "learning_rate": 0.00012125455939824393, "loss": 2.1563, "step": 5770 }, { "epoch": 0.98, "grad_norm": 0.2119140625, "learning_rate": 0.0001211101017314319, "loss": 2.1201, "step": 5775 }, { "epoch": 0.98, "grad_norm": 0.2158203125, "learning_rate": 0.00012096559794256581, "loss": 2.1329, "step": 5780 }, { "epoch": 0.98, "grad_norm": 0.208984375, "learning_rate": 0.00012082104834736244, "loss": 2.1179, "step": 5785 }, { "epoch": 0.98, "grad_norm": 0.208984375, "learning_rate": 0.0001206764532616385, "loss": 2.1557, "step": 5790 }, { "epoch": 0.98, "grad_norm": 0.2080078125, "learning_rate": 0.00012053181300131022, "loss": 2.1671, "step": 5795 }, { "epoch": 0.98, "grad_norm": 0.2138671875, "learning_rate": 0.00012038712788239236, "loss": 2.1472, "step": 5800 }, { "epoch": 0.98, "grad_norm": 0.2158203125, "learning_rate": 0.00012024239822099792, "loss": 2.1443, "step": 5805 }, { "epoch": 0.98, "grad_norm": 0.212890625, "learning_rate": 0.000120097624333337, "loss": 2.1556, "step": 5810 }, { "epoch": 0.98, "grad_norm": 0.21484375, "learning_rate": 0.00011995280653571641, "loss": 2.122, "step": 5815 }, { "epoch": 0.99, "grad_norm": 0.212890625, "learning_rate": 0.00011980794514453897, "loss": 2.0965, "step": 5820 }, { "epoch": 0.99, "grad_norm": 0.205078125, "learning_rate": 0.00011966304047630263, "loss": 2.1735, "step": 5825 }, { "epoch": 0.99, "grad_norm": 0.21484375, "learning_rate": 0.00011951809284759993, "loss": 2.1382, "step": 5830 }, { "epoch": 0.99, "grad_norm": 0.2138671875, "learning_rate": 0.00011937310257511732, "loss": 2.1571, "step": 5835 }, { "epoch": 0.99, "grad_norm": 0.2041015625, "learning_rate": 0.00011922806997563435, "loss": 2.1056, "step": 5840 }, { "epoch": 0.99, "grad_norm": 0.2099609375, "learning_rate": 0.0001190829953660231, "loss": 2.1016, "step": 5845 }, { "epoch": 0.99, "grad_norm": 0.212890625, "learning_rate": 0.00011893787906324738, "loss": 2.1113, "step": 5850 }, { "epoch": 0.99, "grad_norm": 0.2080078125, "learning_rate": 0.0001187927213843622, "loss": 2.1492, "step": 5855 }, { "epoch": 0.99, "grad_norm": 0.2119140625, "learning_rate": 0.0001186475226465128, "loss": 2.1852, "step": 5860 }, { "epoch": 0.99, "grad_norm": 0.21484375, "learning_rate": 0.00011850228316693428, "loss": 2.1586, "step": 5865 }, { "epoch": 0.99, "grad_norm": 0.2080078125, "learning_rate": 0.00011835700326295067, "loss": 2.156, "step": 5870 }, { "epoch": 1.0, "grad_norm": 0.212890625, "learning_rate": 0.00011821168325197436, "loss": 2.1291, "step": 5875 }, { "epoch": 1.0, "grad_norm": 0.220703125, "learning_rate": 0.00011806632345150538, "loss": 2.1686, "step": 5880 }, { "epoch": 1.0, "grad_norm": 0.2099609375, "learning_rate": 0.00011792092417913063, "loss": 2.1898, "step": 5885 }, { "epoch": 1.0, "grad_norm": 0.2109375, "learning_rate": 0.0001177754857525233, "loss": 2.1896, "step": 5890 }, { "epoch": 1.0, "grad_norm": 0.2021484375, "learning_rate": 0.00011763000848944212, "loss": 2.1315, "step": 5895 }, { "epoch": 1.0, "grad_norm": 0.205078125, "learning_rate": 0.00011748449270773066, "loss": 2.1473, "step": 5900 }, { "epoch": 1.0, "eval_loss": 2.1470842361450195, "eval_runtime": 161.6105, "eval_samples_per_second": 16.441, "eval_steps_per_second": 2.061, "step": 5904 }, { "epoch": 1.0, "grad_norm": 0.2197265625, "learning_rate": 0.00011733893872531664, "loss": 2.1194, "step": 5905 }, { "epoch": 1.0, "grad_norm": 0.208984375, "learning_rate": 0.00011719334686021129, "loss": 2.1479, "step": 5910 }, { "epoch": 1.0, "grad_norm": 0.2099609375, "learning_rate": 0.00011704771743050851, "loss": 2.1207, "step": 5915 }, { "epoch": 1.0, "grad_norm": 0.2109375, "learning_rate": 0.00011690205075438438, "loss": 2.157, "step": 5920 }, { "epoch": 1.0, "grad_norm": 0.2158203125, "learning_rate": 0.00011675634715009631, "loss": 2.1193, "step": 5925 }, { "epoch": 1.0, "grad_norm": 0.2119140625, "learning_rate": 0.00011661060693598233, "loss": 2.1361, "step": 5930 }, { "epoch": 1.01, "grad_norm": 0.2109375, "learning_rate": 0.00011646483043046063, "loss": 2.124, "step": 5935 }, { "epoch": 1.01, "grad_norm": 0.208984375, "learning_rate": 0.00011631901795202849, "loss": 2.1033, "step": 5940 }, { "epoch": 1.01, "grad_norm": 0.212890625, "learning_rate": 0.0001161731698192619, "loss": 2.1761, "step": 5945 }, { "epoch": 1.01, "grad_norm": 0.2177734375, "learning_rate": 0.0001160272863508147, "loss": 2.1375, "step": 5950 }, { "epoch": 1.01, "grad_norm": 0.220703125, "learning_rate": 0.00011588136786541802, "loss": 2.1671, "step": 5955 }, { "epoch": 1.01, "grad_norm": 0.21484375, "learning_rate": 0.00011573541468187936, "loss": 2.1324, "step": 5960 }, { "epoch": 1.01, "grad_norm": 0.216796875, "learning_rate": 0.00011558942711908212, "loss": 2.1454, "step": 5965 }, { "epoch": 1.01, "grad_norm": 0.2099609375, "learning_rate": 0.00011544340549598482, "loss": 2.1152, "step": 5970 }, { "epoch": 1.01, "grad_norm": 0.212890625, "learning_rate": 0.00011529735013162036, "loss": 2.1125, "step": 5975 }, { "epoch": 1.01, "grad_norm": 0.2109375, "learning_rate": 0.00011515126134509533, "loss": 2.1649, "step": 5980 }, { "epoch": 1.01, "grad_norm": 0.2109375, "learning_rate": 0.00011500513945558947, "loss": 2.1339, "step": 5985 }, { "epoch": 1.01, "grad_norm": 0.22265625, "learning_rate": 0.00011485898478235464, "loss": 2.1462, "step": 5990 }, { "epoch": 1.02, "grad_norm": 0.216796875, "learning_rate": 0.00011471279764471452, "loss": 2.1817, "step": 5995 }, { "epoch": 1.02, "grad_norm": 0.21484375, "learning_rate": 0.00011456657836206366, "loss": 2.1261, "step": 6000 }, { "epoch": 1.02, "grad_norm": 0.22265625, "learning_rate": 0.00011442032725386675, "loss": 2.1029, "step": 6005 }, { "epoch": 1.02, "grad_norm": 0.2158203125, "learning_rate": 0.00011427404463965814, "loss": 2.1269, "step": 6010 }, { "epoch": 1.02, "grad_norm": 0.20703125, "learning_rate": 0.00011412773083904094, "loss": 2.116, "step": 6015 }, { "epoch": 1.02, "grad_norm": 0.216796875, "learning_rate": 0.00011398138617168642, "loss": 2.1198, "step": 6020 }, { "epoch": 1.02, "grad_norm": 0.21875, "learning_rate": 0.0001138350109573333, "loss": 2.1262, "step": 6025 }, { "epoch": 1.02, "grad_norm": 0.21484375, "learning_rate": 0.00011368860551578702, "loss": 2.1268, "step": 6030 }, { "epoch": 1.02, "grad_norm": 0.2138671875, "learning_rate": 0.00011354217016691905, "loss": 2.157, "step": 6035 }, { "epoch": 1.02, "grad_norm": 0.212890625, "learning_rate": 0.0001133957052306663, "loss": 2.1132, "step": 6040 }, { "epoch": 1.02, "grad_norm": 0.2158203125, "learning_rate": 0.00011324921102703015, "loss": 2.1324, "step": 6045 }, { "epoch": 1.02, "grad_norm": 0.2099609375, "learning_rate": 0.00011310268787607603, "loss": 2.1372, "step": 6050 }, { "epoch": 1.03, "grad_norm": 0.224609375, "learning_rate": 0.00011295613609793267, "loss": 2.1227, "step": 6055 }, { "epoch": 1.03, "grad_norm": 0.2119140625, "learning_rate": 0.00011280955601279127, "loss": 2.1311, "step": 6060 }, { "epoch": 1.03, "grad_norm": 0.2109375, "learning_rate": 0.0001126629479409048, "loss": 2.1219, "step": 6065 }, { "epoch": 1.03, "grad_norm": 0.216796875, "learning_rate": 0.00011251631220258753, "loss": 2.0692, "step": 6070 }, { "epoch": 1.03, "grad_norm": 0.2177734375, "learning_rate": 0.00011236964911821413, "loss": 2.1236, "step": 6075 }, { "epoch": 1.03, "grad_norm": 0.2119140625, "learning_rate": 0.00011222295900821896, "loss": 2.1425, "step": 6080 }, { "epoch": 1.03, "grad_norm": 0.21484375, "learning_rate": 0.00011207624219309544, "loss": 2.1312, "step": 6085 }, { "epoch": 1.03, "grad_norm": 0.2275390625, "learning_rate": 0.00011192949899339544, "loss": 2.1528, "step": 6090 }, { "epoch": 1.03, "grad_norm": 0.2119140625, "learning_rate": 0.00011178272972972833, "loss": 2.1495, "step": 6095 }, { "epoch": 1.03, "grad_norm": 0.212890625, "learning_rate": 0.00011163593472276048, "loss": 2.1504, "step": 6100 }, { "epoch": 1.03, "grad_norm": 0.2099609375, "learning_rate": 0.00011148911429321456, "loss": 2.0733, "step": 6105 }, { "epoch": 1.03, "grad_norm": 0.21875, "learning_rate": 0.00011134226876186871, "loss": 2.0977, "step": 6110 }, { "epoch": 1.04, "grad_norm": 0.216796875, "learning_rate": 0.00011119539844955595, "loss": 2.1138, "step": 6115 }, { "epoch": 1.04, "grad_norm": 0.2177734375, "learning_rate": 0.00011104850367716344, "loss": 2.1027, "step": 6120 }, { "epoch": 1.04, "grad_norm": 0.216796875, "learning_rate": 0.00011090158476563175, "loss": 2.1559, "step": 6125 }, { "epoch": 1.04, "grad_norm": 0.212890625, "learning_rate": 0.00011075464203595427, "loss": 2.1822, "step": 6130 }, { "epoch": 1.04, "grad_norm": 0.2177734375, "learning_rate": 0.00011060767580917634, "loss": 2.1362, "step": 6135 }, { "epoch": 1.04, "grad_norm": 0.2138671875, "learning_rate": 0.00011046068640639464, "loss": 2.1073, "step": 6140 }, { "epoch": 1.04, "grad_norm": 0.22265625, "learning_rate": 0.00011031367414875658, "loss": 2.1463, "step": 6145 }, { "epoch": 1.04, "grad_norm": 0.224609375, "learning_rate": 0.0001101666393574594, "loss": 2.1327, "step": 6150 }, { "epoch": 1.04, "grad_norm": 0.2119140625, "learning_rate": 0.00011001958235374963, "loss": 2.1137, "step": 6155 }, { "epoch": 1.04, "grad_norm": 0.216796875, "learning_rate": 0.0001098725034589223, "loss": 2.1158, "step": 6160 }, { "epoch": 1.04, "grad_norm": 0.2197265625, "learning_rate": 0.00010972540299432033, "loss": 2.1705, "step": 6165 }, { "epoch": 1.04, "grad_norm": 0.228515625, "learning_rate": 0.00010957828128133363, "loss": 2.1357, "step": 6170 }, { "epoch": 1.05, "grad_norm": 0.2158203125, "learning_rate": 0.00010943113864139868, "loss": 2.1597, "step": 6175 }, { "epoch": 1.05, "grad_norm": 0.2265625, "learning_rate": 0.00010928397539599766, "loss": 2.1408, "step": 6180 }, { "epoch": 1.05, "grad_norm": 0.220703125, "learning_rate": 0.00010913679186665766, "loss": 2.1515, "step": 6185 }, { "epoch": 1.05, "grad_norm": 0.224609375, "learning_rate": 0.00010898958837495021, "loss": 2.1577, "step": 6190 }, { "epoch": 1.05, "grad_norm": 0.224609375, "learning_rate": 0.00010884236524249039, "loss": 2.1414, "step": 6195 }, { "epoch": 1.05, "grad_norm": 0.21875, "learning_rate": 0.0001086951227909362, "loss": 2.0958, "step": 6200 }, { "epoch": 1.05, "grad_norm": 0.216796875, "learning_rate": 0.00010854786134198786, "loss": 2.0813, "step": 6205 }, { "epoch": 1.05, "grad_norm": 0.2197265625, "learning_rate": 0.00010840058121738712, "loss": 2.107, "step": 6210 }, { "epoch": 1.05, "grad_norm": 0.2177734375, "learning_rate": 0.00010825328273891646, "loss": 2.1572, "step": 6215 }, { "epoch": 1.05, "grad_norm": 0.220703125, "learning_rate": 0.00010810596622839854, "loss": 2.1621, "step": 6220 }, { "epoch": 1.05, "grad_norm": 0.2197265625, "learning_rate": 0.00010795863200769538, "loss": 2.1263, "step": 6225 }, { "epoch": 1.06, "grad_norm": 0.216796875, "learning_rate": 0.00010781128039870769, "loss": 2.113, "step": 6230 }, { "epoch": 1.06, "grad_norm": 0.216796875, "learning_rate": 0.0001076639117233742, "loss": 2.1485, "step": 6235 }, { "epoch": 1.06, "grad_norm": 0.2255859375, "learning_rate": 0.00010751652630367086, "loss": 2.0961, "step": 6240 }, { "epoch": 1.06, "grad_norm": 0.21875, "learning_rate": 0.0001073691244616103, "loss": 2.1367, "step": 6245 }, { "epoch": 1.06, "grad_norm": 0.23828125, "learning_rate": 0.00010722170651924091, "loss": 2.1195, "step": 6250 }, { "epoch": 1.06, "grad_norm": 0.2294921875, "learning_rate": 0.00010707427279864637, "loss": 2.1521, "step": 6255 }, { "epoch": 1.06, "grad_norm": 0.21875, "learning_rate": 0.00010692682362194481, "loss": 2.1207, "step": 6260 }, { "epoch": 1.06, "grad_norm": 0.2197265625, "learning_rate": 0.00010677935931128807, "loss": 2.1476, "step": 6265 }, { "epoch": 1.06, "grad_norm": 0.21875, "learning_rate": 0.0001066318801888611, "loss": 2.0966, "step": 6270 }, { "epoch": 1.06, "grad_norm": 0.224609375, "learning_rate": 0.00010648438657688123, "loss": 2.1013, "step": 6275 }, { "epoch": 1.06, "grad_norm": 0.2216796875, "learning_rate": 0.00010633687879759738, "loss": 2.1487, "step": 6280 }, { "epoch": 1.06, "grad_norm": 0.2197265625, "learning_rate": 0.00010618935717328944, "loss": 2.1477, "step": 6285 }, { "epoch": 1.07, "grad_norm": 0.2236328125, "learning_rate": 0.00010604182202626765, "loss": 2.1778, "step": 6290 }, { "epoch": 1.07, "grad_norm": 0.234375, "learning_rate": 0.0001058942736788717, "loss": 2.1494, "step": 6295 }, { "epoch": 1.07, "grad_norm": 0.2138671875, "learning_rate": 0.00010574671245347005, "loss": 2.1321, "step": 6300 }, { "epoch": 1.07, "grad_norm": 0.216796875, "learning_rate": 0.00010559913867245952, "loss": 2.1529, "step": 6305 }, { "epoch": 1.07, "grad_norm": 0.21875, "learning_rate": 0.00010545155265826414, "loss": 2.1089, "step": 6310 }, { "epoch": 1.07, "grad_norm": 0.22265625, "learning_rate": 0.00010530395473333477, "loss": 2.1105, "step": 6315 }, { "epoch": 1.07, "grad_norm": 0.2197265625, "learning_rate": 0.00010515634522014828, "loss": 2.0971, "step": 6320 }, { "epoch": 1.07, "grad_norm": 0.2236328125, "learning_rate": 0.00010500872444120686, "loss": 2.1279, "step": 6325 }, { "epoch": 1.07, "grad_norm": 0.234375, "learning_rate": 0.0001048610927190373, "loss": 2.1178, "step": 6330 }, { "epoch": 1.07, "grad_norm": 0.2197265625, "learning_rate": 0.00010471345037619032, "loss": 2.1238, "step": 6335 }, { "epoch": 1.07, "grad_norm": 0.2236328125, "learning_rate": 0.0001045657977352398, "loss": 2.1127, "step": 6340 }, { "epoch": 1.07, "grad_norm": 0.2177734375, "learning_rate": 0.0001044181351187822, "loss": 2.1348, "step": 6345 }, { "epoch": 1.08, "grad_norm": 0.224609375, "learning_rate": 0.00010427046284943572, "loss": 2.1502, "step": 6350 }, { "epoch": 1.08, "grad_norm": 0.2255859375, "learning_rate": 0.0001041227812498396, "loss": 2.1288, "step": 6355 }, { "epoch": 1.08, "grad_norm": 0.216796875, "learning_rate": 0.00010397509064265359, "loss": 2.1578, "step": 6360 }, { "epoch": 1.08, "grad_norm": 0.2265625, "learning_rate": 0.00010382739135055703, "loss": 2.1656, "step": 6365 }, { "epoch": 1.08, "grad_norm": 0.220703125, "learning_rate": 0.00010367968369624825, "loss": 2.1261, "step": 6370 }, { "epoch": 1.08, "grad_norm": 0.21875, "learning_rate": 0.00010353196800244382, "loss": 2.1418, "step": 6375 }, { "epoch": 1.08, "grad_norm": 0.21875, "learning_rate": 0.00010338424459187801, "loss": 2.1163, "step": 6380 }, { "epoch": 1.08, "grad_norm": 0.220703125, "learning_rate": 0.00010323651378730179, "loss": 2.114, "step": 6385 }, { "epoch": 1.08, "grad_norm": 0.2265625, "learning_rate": 0.0001030887759114823, "loss": 2.0651, "step": 6390 }, { "epoch": 1.08, "grad_norm": 0.21484375, "learning_rate": 0.00010294103128720227, "loss": 2.1278, "step": 6395 }, { "epoch": 1.08, "grad_norm": 0.224609375, "learning_rate": 0.00010279328023725905, "loss": 2.1356, "step": 6400 }, { "epoch": 1.08, "grad_norm": 0.2197265625, "learning_rate": 0.00010264552308446403, "loss": 2.1141, "step": 6405 }, { "epoch": 1.09, "grad_norm": 0.220703125, "learning_rate": 0.00010249776015164197, "loss": 2.0926, "step": 6410 }, { "epoch": 1.09, "grad_norm": 0.224609375, "learning_rate": 0.00010234999176163026, "loss": 2.1441, "step": 6415 }, { "epoch": 1.09, "grad_norm": 0.224609375, "learning_rate": 0.00010220221823727822, "loss": 2.15, "step": 6420 }, { "epoch": 1.09, "grad_norm": 0.2177734375, "learning_rate": 0.00010205443990144636, "loss": 2.126, "step": 6425 }, { "epoch": 1.09, "grad_norm": 0.2265625, "learning_rate": 0.0001019066570770057, "loss": 2.1257, "step": 6430 }, { "epoch": 1.09, "grad_norm": 0.228515625, "learning_rate": 0.00010175887008683712, "loss": 2.1286, "step": 6435 }, { "epoch": 1.09, "grad_norm": 0.2255859375, "learning_rate": 0.00010161107925383054, "loss": 2.1173, "step": 6440 }, { "epoch": 1.09, "grad_norm": 0.21875, "learning_rate": 0.00010146328490088428, "loss": 2.1478, "step": 6445 }, { "epoch": 1.09, "grad_norm": 0.2216796875, "learning_rate": 0.00010131548735090437, "loss": 2.134, "step": 6450 }, { "epoch": 1.09, "grad_norm": 0.22265625, "learning_rate": 0.00010116768692680387, "loss": 2.1342, "step": 6455 }, { "epoch": 1.09, "grad_norm": 0.2294921875, "learning_rate": 0.00010101988395150203, "loss": 2.1318, "step": 6460 }, { "epoch": 1.09, "grad_norm": 0.2216796875, "learning_rate": 0.00010087207874792374, "loss": 2.1647, "step": 6465 }, { "epoch": 1.1, "grad_norm": 0.224609375, "learning_rate": 0.00010072427163899874, "loss": 2.1257, "step": 6470 }, { "epoch": 1.1, "grad_norm": 0.2158203125, "learning_rate": 0.0001005764629476609, "loss": 2.1106, "step": 6475 }, { "epoch": 1.1, "grad_norm": 0.224609375, "learning_rate": 0.0001004286529968476, "loss": 2.1002, "step": 6480 }, { "epoch": 1.1, "grad_norm": 0.22265625, "learning_rate": 0.00010028084210949895, "loss": 2.1074, "step": 6485 }, { "epoch": 1.1, "grad_norm": 0.2294921875, "learning_rate": 0.00010013303060855708, "loss": 2.0886, "step": 6490 }, { "epoch": 1.1, "grad_norm": 0.2255859375, "learning_rate": 9.998521881696551e-05, "loss": 2.0777, "step": 6495 }, { "epoch": 1.1, "grad_norm": 0.2216796875, "learning_rate": 9.98374070576684e-05, "loss": 2.1192, "step": 6500 }, { "epoch": 1.1, "grad_norm": 0.2080078125, "learning_rate": 9.968959565360973e-05, "loss": 2.103, "step": 6505 }, { "epoch": 1.1, "grad_norm": 0.2138671875, "learning_rate": 9.954178492773278e-05, "loss": 2.1614, "step": 6510 }, { "epoch": 1.1, "grad_norm": 0.2197265625, "learning_rate": 9.939397520297949e-05, "loss": 2.1397, "step": 6515 }, { "epoch": 1.1, "grad_norm": 0.2216796875, "learning_rate": 9.924616680228933e-05, "loss": 2.0756, "step": 6520 }, { "epoch": 1.11, "grad_norm": 0.2216796875, "learning_rate": 9.909836004859908e-05, "loss": 2.1093, "step": 6525 }, { "epoch": 1.11, "grad_norm": 0.21875, "learning_rate": 9.895055526484184e-05, "loss": 2.1218, "step": 6530 }, { "epoch": 1.11, "grad_norm": 0.224609375, "learning_rate": 9.880275277394644e-05, "loss": 2.1829, "step": 6535 }, { "epoch": 1.11, "grad_norm": 0.2177734375, "learning_rate": 9.865495289883672e-05, "loss": 2.1078, "step": 6540 }, { "epoch": 1.11, "grad_norm": 0.2294921875, "learning_rate": 9.850715596243073e-05, "loss": 2.1234, "step": 6545 }, { "epoch": 1.11, "grad_norm": 0.220703125, "learning_rate": 9.835936228764014e-05, "loss": 2.0701, "step": 6550 }, { "epoch": 1.11, "grad_norm": 0.2216796875, "learning_rate": 9.821157219736955e-05, "loss": 2.1111, "step": 6555 }, { "epoch": 1.11, "grad_norm": 0.2197265625, "learning_rate": 9.806378601451563e-05, "loss": 2.1091, "step": 6560 }, { "epoch": 1.11, "grad_norm": 0.22265625, "learning_rate": 9.791600406196656e-05, "loss": 2.1229, "step": 6565 }, { "epoch": 1.11, "grad_norm": 0.216796875, "learning_rate": 9.776822666260133e-05, "loss": 2.1289, "step": 6570 }, { "epoch": 1.11, "grad_norm": 0.2177734375, "learning_rate": 9.762045413928884e-05, "loss": 2.0959, "step": 6575 }, { "epoch": 1.11, "grad_norm": 0.216796875, "learning_rate": 9.747268681488749e-05, "loss": 2.1405, "step": 6580 }, { "epoch": 1.12, "grad_norm": 0.2197265625, "learning_rate": 9.732492501224426e-05, "loss": 2.1203, "step": 6585 }, { "epoch": 1.12, "grad_norm": 0.2265625, "learning_rate": 9.717716905419403e-05, "loss": 2.1509, "step": 6590 }, { "epoch": 1.12, "grad_norm": 0.22265625, "learning_rate": 9.702941926355897e-05, "loss": 2.1252, "step": 6595 }, { "epoch": 1.12, "grad_norm": 0.2158203125, "learning_rate": 9.688167596314772e-05, "loss": 2.1211, "step": 6600 }, { "epoch": 1.12, "grad_norm": 0.21875, "learning_rate": 9.673393947575477e-05, "loss": 2.1291, "step": 6605 }, { "epoch": 1.12, "grad_norm": 0.2216796875, "learning_rate": 9.658621012415974e-05, "loss": 2.1686, "step": 6610 }, { "epoch": 1.12, "grad_norm": 0.2197265625, "learning_rate": 9.643848823112664e-05, "loss": 2.1454, "step": 6615 }, { "epoch": 1.12, "grad_norm": 0.21875, "learning_rate": 9.629077411940318e-05, "loss": 2.1243, "step": 6620 }, { "epoch": 1.12, "grad_norm": 0.2177734375, "learning_rate": 9.614306811172009e-05, "loss": 2.1075, "step": 6625 }, { "epoch": 1.12, "grad_norm": 0.2177734375, "learning_rate": 9.599537053079037e-05, "loss": 2.1105, "step": 6630 }, { "epoch": 1.12, "grad_norm": 0.224609375, "learning_rate": 9.58476816993086e-05, "loss": 2.1203, "step": 6635 }, { "epoch": 1.12, "grad_norm": 0.2255859375, "learning_rate": 9.570000193995028e-05, "loss": 2.1075, "step": 6640 }, { "epoch": 1.13, "grad_norm": 0.228515625, "learning_rate": 9.555233157537109e-05, "loss": 2.1306, "step": 6645 }, { "epoch": 1.13, "grad_norm": 0.21484375, "learning_rate": 9.540467092820614e-05, "loss": 2.1238, "step": 6650 }, { "epoch": 1.13, "grad_norm": 0.2255859375, "learning_rate": 9.525702032106933e-05, "loss": 2.1468, "step": 6655 }, { "epoch": 1.13, "grad_norm": 0.2236328125, "learning_rate": 9.510938007655264e-05, "loss": 2.1477, "step": 6660 }, { "epoch": 1.13, "grad_norm": 0.22265625, "learning_rate": 9.496175051722542e-05, "loss": 2.1205, "step": 6665 }, { "epoch": 1.13, "grad_norm": 0.2158203125, "learning_rate": 9.481413196563362e-05, "loss": 2.1107, "step": 6670 }, { "epoch": 1.13, "grad_norm": 0.216796875, "learning_rate": 9.466652474429915e-05, "loss": 2.1116, "step": 6675 }, { "epoch": 1.13, "grad_norm": 0.2236328125, "learning_rate": 9.451892917571927e-05, "loss": 2.1433, "step": 6680 }, { "epoch": 1.13, "grad_norm": 0.2158203125, "learning_rate": 9.437134558236562e-05, "loss": 2.1305, "step": 6685 }, { "epoch": 1.13, "grad_norm": 0.2333984375, "learning_rate": 9.42237742866838e-05, "loss": 2.141, "step": 6690 }, { "epoch": 1.13, "grad_norm": 0.22265625, "learning_rate": 9.407621561109251e-05, "loss": 2.0987, "step": 6695 }, { "epoch": 1.13, "grad_norm": 0.22265625, "learning_rate": 9.392866987798277e-05, "loss": 2.1598, "step": 6700 }, { "epoch": 1.14, "grad_norm": 0.21875, "learning_rate": 9.378113740971754e-05, "loss": 2.1487, "step": 6705 }, { "epoch": 1.14, "grad_norm": 0.2158203125, "learning_rate": 9.363361852863058e-05, "loss": 2.1104, "step": 6710 }, { "epoch": 1.14, "grad_norm": 0.2216796875, "learning_rate": 9.348611355702608e-05, "loss": 2.1171, "step": 6715 }, { "epoch": 1.14, "grad_norm": 0.2177734375, "learning_rate": 9.333862281717788e-05, "loss": 2.1482, "step": 6720 }, { "epoch": 1.14, "grad_norm": 0.21484375, "learning_rate": 9.31911466313286e-05, "loss": 2.0912, "step": 6725 }, { "epoch": 1.14, "grad_norm": 0.2158203125, "learning_rate": 9.304368532168912e-05, "loss": 2.0972, "step": 6730 }, { "epoch": 1.14, "grad_norm": 0.2236328125, "learning_rate": 9.28962392104379e-05, "loss": 2.1025, "step": 6735 }, { "epoch": 1.14, "grad_norm": 0.224609375, "learning_rate": 9.274880861972005e-05, "loss": 2.0854, "step": 6740 }, { "epoch": 1.14, "grad_norm": 0.224609375, "learning_rate": 9.260139387164684e-05, "loss": 2.1208, "step": 6745 }, { "epoch": 1.14, "grad_norm": 0.22265625, "learning_rate": 9.245399528829501e-05, "loss": 2.1269, "step": 6750 }, { "epoch": 1.14, "grad_norm": 0.21875, "learning_rate": 9.230661319170578e-05, "loss": 2.0986, "step": 6755 }, { "epoch": 1.14, "grad_norm": 0.2236328125, "learning_rate": 9.215924790388451e-05, "loss": 2.1067, "step": 6760 }, { "epoch": 1.15, "grad_norm": 0.2275390625, "learning_rate": 9.201189974679986e-05, "loss": 2.1029, "step": 6765 }, { "epoch": 1.15, "grad_norm": 0.2197265625, "learning_rate": 9.186456904238292e-05, "loss": 2.1548, "step": 6770 }, { "epoch": 1.15, "grad_norm": 0.220703125, "learning_rate": 9.171725611252676e-05, "loss": 2.1147, "step": 6775 }, { "epoch": 1.15, "grad_norm": 0.2236328125, "learning_rate": 9.156996127908555e-05, "loss": 2.1242, "step": 6780 }, { "epoch": 1.15, "grad_norm": 0.23046875, "learning_rate": 9.142268486387398e-05, "loss": 2.0846, "step": 6785 }, { "epoch": 1.15, "grad_norm": 0.23828125, "learning_rate": 9.127542718866646e-05, "loss": 2.1363, "step": 6790 }, { "epoch": 1.15, "grad_norm": 0.220703125, "learning_rate": 9.112818857519647e-05, "loss": 2.1028, "step": 6795 }, { "epoch": 1.15, "grad_norm": 0.21875, "learning_rate": 9.098096934515583e-05, "loss": 2.1668, "step": 6800 }, { "epoch": 1.15, "grad_norm": 0.2236328125, "learning_rate": 9.083376982019406e-05, "loss": 2.1371, "step": 6805 }, { "epoch": 1.15, "grad_norm": 0.2255859375, "learning_rate": 9.068659032191753e-05, "loss": 2.1092, "step": 6810 }, { "epoch": 1.15, "grad_norm": 0.21875, "learning_rate": 9.053943117188896e-05, "loss": 2.1803, "step": 6815 }, { "epoch": 1.16, "grad_norm": 0.2255859375, "learning_rate": 9.039229269162656e-05, "loss": 2.1319, "step": 6820 }, { "epoch": 1.16, "grad_norm": 0.2236328125, "learning_rate": 9.024517520260339e-05, "loss": 2.1312, "step": 6825 }, { "epoch": 1.16, "grad_norm": 0.2255859375, "learning_rate": 9.009807902624662e-05, "loss": 2.1224, "step": 6830 }, { "epoch": 1.16, "grad_norm": 0.2197265625, "learning_rate": 8.99510044839369e-05, "loss": 2.1172, "step": 6835 }, { "epoch": 1.16, "grad_norm": 0.21484375, "learning_rate": 8.980395189700758e-05, "loss": 2.1406, "step": 6840 }, { "epoch": 1.16, "grad_norm": 0.22265625, "learning_rate": 8.965692158674408e-05, "loss": 2.1704, "step": 6845 }, { "epoch": 1.16, "grad_norm": 0.2216796875, "learning_rate": 8.950991387438308e-05, "loss": 2.0968, "step": 6850 }, { "epoch": 1.16, "grad_norm": 0.25, "learning_rate": 8.936292908111197e-05, "loss": 2.1551, "step": 6855 }, { "epoch": 1.16, "grad_norm": 0.2236328125, "learning_rate": 8.921596752806802e-05, "loss": 2.134, "step": 6860 }, { "epoch": 1.16, "grad_norm": 0.2236328125, "learning_rate": 8.906902953633771e-05, "loss": 2.1215, "step": 6865 }, { "epoch": 1.16, "grad_norm": 0.216796875, "learning_rate": 8.892211542695607e-05, "loss": 2.1057, "step": 6870 }, { "epoch": 1.16, "grad_norm": 0.2265625, "learning_rate": 8.877522552090598e-05, "loss": 2.14, "step": 6875 }, { "epoch": 1.17, "grad_norm": 0.2177734375, "learning_rate": 8.862836013911735e-05, "loss": 2.0927, "step": 6880 }, { "epoch": 1.17, "grad_norm": 0.2255859375, "learning_rate": 8.848151960246663e-05, "loss": 2.1415, "step": 6885 }, { "epoch": 1.17, "grad_norm": 0.21875, "learning_rate": 8.833470423177578e-05, "loss": 2.1684, "step": 6890 }, { "epoch": 1.17, "grad_norm": 0.21875, "learning_rate": 8.818791434781208e-05, "loss": 2.1264, "step": 6895 }, { "epoch": 1.17, "grad_norm": 0.220703125, "learning_rate": 8.804115027128692e-05, "loss": 2.0953, "step": 6900 }, { "epoch": 1.17, "grad_norm": 0.234375, "learning_rate": 8.789441232285524e-05, "loss": 2.1234, "step": 6905 }, { "epoch": 1.17, "grad_norm": 0.2236328125, "learning_rate": 8.774770082311512e-05, "loss": 2.1046, "step": 6910 }, { "epoch": 1.17, "grad_norm": 0.2265625, "learning_rate": 8.760101609260673e-05, "loss": 2.1038, "step": 6915 }, { "epoch": 1.17, "grad_norm": 0.21875, "learning_rate": 8.745435845181168e-05, "loss": 2.1039, "step": 6920 }, { "epoch": 1.17, "grad_norm": 0.2236328125, "learning_rate": 8.730772822115252e-05, "loss": 2.1259, "step": 6925 }, { "epoch": 1.17, "grad_norm": 0.216796875, "learning_rate": 8.716112572099193e-05, "loss": 2.0748, "step": 6930 }, { "epoch": 1.17, "grad_norm": 0.2275390625, "learning_rate": 8.701455127163181e-05, "loss": 2.1662, "step": 6935 }, { "epoch": 1.18, "grad_norm": 0.224609375, "learning_rate": 8.686800519331298e-05, "loss": 2.0985, "step": 6940 }, { "epoch": 1.18, "grad_norm": 0.2265625, "learning_rate": 8.672148780621423e-05, "loss": 2.1478, "step": 6945 }, { "epoch": 1.18, "grad_norm": 0.22265625, "learning_rate": 8.657499943045153e-05, "loss": 2.1141, "step": 6950 }, { "epoch": 1.18, "grad_norm": 0.224609375, "learning_rate": 8.642854038607769e-05, "loss": 2.1271, "step": 6955 }, { "epoch": 1.18, "grad_norm": 0.2197265625, "learning_rate": 8.628211099308119e-05, "loss": 2.1009, "step": 6960 }, { "epoch": 1.18, "grad_norm": 0.216796875, "learning_rate": 8.61357115713859e-05, "loss": 2.1326, "step": 6965 }, { "epoch": 1.18, "grad_norm": 0.2236328125, "learning_rate": 8.598934244085022e-05, "loss": 2.1111, "step": 6970 }, { "epoch": 1.18, "grad_norm": 0.22265625, "learning_rate": 8.584300392126621e-05, "loss": 2.1095, "step": 6975 }, { "epoch": 1.18, "grad_norm": 0.2236328125, "learning_rate": 8.569669633235917e-05, "loss": 2.1472, "step": 6980 }, { "epoch": 1.18, "grad_norm": 0.216796875, "learning_rate": 8.555041999378687e-05, "loss": 2.1558, "step": 6985 }, { "epoch": 1.18, "grad_norm": 0.2216796875, "learning_rate": 8.540417522513864e-05, "loss": 2.0741, "step": 6990 }, { "epoch": 1.18, "grad_norm": 0.220703125, "learning_rate": 8.525796234593493e-05, "loss": 2.1253, "step": 6995 }, { "epoch": 1.19, "grad_norm": 0.22265625, "learning_rate": 8.511178167562662e-05, "loss": 2.1103, "step": 7000 }, { "epoch": 1.19, "grad_norm": 0.2177734375, "learning_rate": 8.496563353359398e-05, "loss": 2.1059, "step": 7005 }, { "epoch": 1.19, "grad_norm": 0.220703125, "learning_rate": 8.481951823914642e-05, "loss": 2.0873, "step": 7010 }, { "epoch": 1.19, "grad_norm": 0.228515625, "learning_rate": 8.467343611152147e-05, "loss": 2.1031, "step": 7015 }, { "epoch": 1.19, "grad_norm": 0.310546875, "learning_rate": 8.452738746988425e-05, "loss": 2.1256, "step": 7020 }, { "epoch": 1.19, "grad_norm": 0.21484375, "learning_rate": 8.43813726333267e-05, "loss": 2.0955, "step": 7025 }, { "epoch": 1.19, "grad_norm": 0.224609375, "learning_rate": 8.42353919208669e-05, "loss": 2.1246, "step": 7030 }, { "epoch": 1.19, "grad_norm": 0.2275390625, "learning_rate": 8.408944565144838e-05, "loss": 2.1745, "step": 7035 }, { "epoch": 1.19, "grad_norm": 0.220703125, "learning_rate": 8.394353414393943e-05, "loss": 2.1093, "step": 7040 }, { "epoch": 1.19, "grad_norm": 0.234375, "learning_rate": 8.379765771713233e-05, "loss": 2.1581, "step": 7045 }, { "epoch": 1.19, "grad_norm": 0.21875, "learning_rate": 8.365181668974279e-05, "loss": 2.0691, "step": 7050 }, { "epoch": 1.19, "grad_norm": 0.216796875, "learning_rate": 8.350601138040917e-05, "loss": 2.1291, "step": 7055 }, { "epoch": 1.2, "grad_norm": 0.21875, "learning_rate": 8.336024210769172e-05, "loss": 2.1567, "step": 7060 }, { "epoch": 1.2, "grad_norm": 0.22265625, "learning_rate": 8.321450919007207e-05, "loss": 2.147, "step": 7065 }, { "epoch": 1.2, "grad_norm": 0.2236328125, "learning_rate": 8.30688129459523e-05, "loss": 2.1118, "step": 7070 }, { "epoch": 1.2, "grad_norm": 0.2216796875, "learning_rate": 8.292315369365442e-05, "loss": 2.0944, "step": 7075 }, { "epoch": 1.2, "grad_norm": 0.2333984375, "learning_rate": 8.27775317514197e-05, "loss": 2.0953, "step": 7080 }, { "epoch": 1.2, "grad_norm": 0.228515625, "learning_rate": 8.263194743740769e-05, "loss": 2.1687, "step": 7085 }, { "epoch": 1.2, "grad_norm": 0.220703125, "learning_rate": 8.248640106969595e-05, "loss": 2.1272, "step": 7090 }, { "epoch": 1.2, "grad_norm": 0.2216796875, "learning_rate": 8.234089296627903e-05, "loss": 2.1588, "step": 7095 }, { "epoch": 1.2, "grad_norm": 0.2265625, "learning_rate": 8.219542344506784e-05, "loss": 2.1068, "step": 7100 }, { "epoch": 1.2, "grad_norm": 0.2275390625, "learning_rate": 8.204999282388903e-05, "loss": 2.1186, "step": 7105 }, { "epoch": 1.2, "grad_norm": 0.2236328125, "learning_rate": 8.190460142048434e-05, "loss": 2.0982, "step": 7110 }, { "epoch": 1.21, "grad_norm": 0.224609375, "learning_rate": 8.175924955250971e-05, "loss": 2.09, "step": 7115 }, { "epoch": 1.21, "grad_norm": 0.2216796875, "learning_rate": 8.161393753753474e-05, "loss": 2.1447, "step": 7120 }, { "epoch": 1.21, "grad_norm": 0.22265625, "learning_rate": 8.146866569304199e-05, "loss": 2.0919, "step": 7125 }, { "epoch": 1.21, "grad_norm": 0.2236328125, "learning_rate": 8.13234343364262e-05, "loss": 2.1315, "step": 7130 }, { "epoch": 1.21, "grad_norm": 0.2255859375, "learning_rate": 8.117824378499374e-05, "loss": 2.1157, "step": 7135 }, { "epoch": 1.21, "grad_norm": 0.22265625, "learning_rate": 8.103309435596165e-05, "loss": 2.1379, "step": 7140 }, { "epoch": 1.21, "grad_norm": 0.2265625, "learning_rate": 8.088798636645733e-05, "loss": 2.1274, "step": 7145 }, { "epoch": 1.21, "grad_norm": 0.228515625, "learning_rate": 8.074292013351759e-05, "loss": 2.1492, "step": 7150 }, { "epoch": 1.21, "grad_norm": 0.22265625, "learning_rate": 8.059789597408785e-05, "loss": 2.1494, "step": 7155 }, { "epoch": 1.21, "grad_norm": 0.22265625, "learning_rate": 8.045291420502182e-05, "loss": 2.1487, "step": 7160 }, { "epoch": 1.21, "grad_norm": 0.220703125, "learning_rate": 8.030797514308052e-05, "loss": 2.1566, "step": 7165 }, { "epoch": 1.21, "grad_norm": 0.2197265625, "learning_rate": 8.016307910493153e-05, "loss": 2.0946, "step": 7170 }, { "epoch": 1.22, "grad_norm": 0.2216796875, "learning_rate": 8.001822640714865e-05, "loss": 2.11, "step": 7175 }, { "epoch": 1.22, "grad_norm": 0.22265625, "learning_rate": 7.987341736621089e-05, "loss": 2.1462, "step": 7180 }, { "epoch": 1.22, "grad_norm": 0.2255859375, "learning_rate": 7.972865229850176e-05, "loss": 2.0978, "step": 7185 }, { "epoch": 1.22, "grad_norm": 0.2177734375, "learning_rate": 7.958393152030894e-05, "loss": 2.1292, "step": 7190 }, { "epoch": 1.22, "grad_norm": 0.2294921875, "learning_rate": 7.943925534782311e-05, "loss": 2.1581, "step": 7195 }, { "epoch": 1.22, "grad_norm": 0.22265625, "learning_rate": 7.929462409713762e-05, "loss": 2.1376, "step": 7200 }, { "epoch": 1.22, "grad_norm": 0.2265625, "learning_rate": 7.915003808424771e-05, "loss": 2.1427, "step": 7205 }, { "epoch": 1.22, "grad_norm": 0.2197265625, "learning_rate": 7.900549762504963e-05, "loss": 2.1218, "step": 7210 }, { "epoch": 1.22, "grad_norm": 0.2275390625, "learning_rate": 7.886100303534022e-05, "loss": 2.1444, "step": 7215 }, { "epoch": 1.22, "grad_norm": 0.2255859375, "learning_rate": 7.871655463081615e-05, "loss": 2.1039, "step": 7220 }, { "epoch": 1.22, "grad_norm": 0.224609375, "learning_rate": 7.8572152727073e-05, "loss": 2.0897, "step": 7225 }, { "epoch": 1.22, "grad_norm": 0.2265625, "learning_rate": 7.842779763960493e-05, "loss": 2.0614, "step": 7230 }, { "epoch": 1.23, "grad_norm": 0.2177734375, "learning_rate": 7.828348968380374e-05, "loss": 2.1025, "step": 7235 }, { "epoch": 1.23, "grad_norm": 0.2138671875, "learning_rate": 7.813922917495824e-05, "loss": 2.1359, "step": 7240 }, { "epoch": 1.23, "grad_norm": 0.228515625, "learning_rate": 7.799501642825364e-05, "loss": 2.1166, "step": 7245 }, { "epoch": 1.23, "grad_norm": 0.234375, "learning_rate": 7.785085175877071e-05, "loss": 2.1249, "step": 7250 }, { "epoch": 1.23, "grad_norm": 0.22265625, "learning_rate": 7.770673548148524e-05, "loss": 2.1482, "step": 7255 }, { "epoch": 1.23, "grad_norm": 0.228515625, "learning_rate": 7.756266791126731e-05, "loss": 2.1217, "step": 7260 }, { "epoch": 1.23, "grad_norm": 0.23046875, "learning_rate": 7.74186493628805e-05, "loss": 2.0725, "step": 7265 }, { "epoch": 1.23, "grad_norm": 0.220703125, "learning_rate": 7.727468015098135e-05, "loss": 2.1105, "step": 7270 }, { "epoch": 1.23, "grad_norm": 0.2265625, "learning_rate": 7.713076059011864e-05, "loss": 2.0842, "step": 7275 }, { "epoch": 1.23, "grad_norm": 0.2275390625, "learning_rate": 7.698689099473254e-05, "loss": 2.1156, "step": 7280 }, { "epoch": 1.23, "grad_norm": 0.228515625, "learning_rate": 7.68430716791542e-05, "loss": 2.171, "step": 7285 }, { "epoch": 1.23, "grad_norm": 0.2314453125, "learning_rate": 7.669930295760486e-05, "loss": 2.1235, "step": 7290 }, { "epoch": 1.24, "grad_norm": 0.2265625, "learning_rate": 7.655558514419518e-05, "loss": 2.1518, "step": 7295 }, { "epoch": 1.24, "grad_norm": 0.2265625, "learning_rate": 7.641191855292464e-05, "loss": 2.0936, "step": 7300 }, { "epoch": 1.24, "grad_norm": 0.23046875, "learning_rate": 7.626830349768084e-05, "loss": 2.1468, "step": 7305 }, { "epoch": 1.24, "grad_norm": 0.2236328125, "learning_rate": 7.612474029223866e-05, "loss": 2.1352, "step": 7310 }, { "epoch": 1.24, "grad_norm": 0.2314453125, "learning_rate": 7.598122925025985e-05, "loss": 2.1398, "step": 7315 }, { "epoch": 1.24, "grad_norm": 0.220703125, "learning_rate": 7.583777068529209e-05, "loss": 2.1497, "step": 7320 }, { "epoch": 1.24, "grad_norm": 0.234375, "learning_rate": 7.569436491076842e-05, "loss": 2.1127, "step": 7325 }, { "epoch": 1.24, "grad_norm": 0.23046875, "learning_rate": 7.55510122400066e-05, "loss": 2.1145, "step": 7330 }, { "epoch": 1.24, "grad_norm": 0.2353515625, "learning_rate": 7.540771298620826e-05, "loss": 2.1487, "step": 7335 }, { "epoch": 1.24, "grad_norm": 0.22265625, "learning_rate": 7.526446746245843e-05, "loss": 2.122, "step": 7340 }, { "epoch": 1.24, "grad_norm": 0.22265625, "learning_rate": 7.512127598172471e-05, "loss": 2.131, "step": 7345 }, { "epoch": 1.24, "grad_norm": 0.22265625, "learning_rate": 7.497813885685661e-05, "loss": 2.1383, "step": 7350 }, { "epoch": 1.25, "grad_norm": 0.2314453125, "learning_rate": 7.483505640058488e-05, "loss": 2.1283, "step": 7355 }, { "epoch": 1.25, "grad_norm": 0.2333984375, "learning_rate": 7.469202892552088e-05, "loss": 2.1387, "step": 7360 }, { "epoch": 1.25, "grad_norm": 0.2275390625, "learning_rate": 7.454905674415575e-05, "loss": 2.1122, "step": 7365 }, { "epoch": 1.25, "grad_norm": 0.232421875, "learning_rate": 7.440614016885996e-05, "loss": 2.121, "step": 7370 }, { "epoch": 1.25, "grad_norm": 0.2236328125, "learning_rate": 7.426327951188227e-05, "loss": 2.1497, "step": 7375 }, { "epoch": 1.25, "grad_norm": 0.224609375, "learning_rate": 7.412047508534953e-05, "loss": 2.1219, "step": 7380 }, { "epoch": 1.25, "grad_norm": 0.232421875, "learning_rate": 7.397772720126561e-05, "loss": 2.1193, "step": 7385 }, { "epoch": 1.25, "grad_norm": 0.228515625, "learning_rate": 7.383503617151075e-05, "loss": 2.0977, "step": 7390 }, { "epoch": 1.25, "grad_norm": 0.2265625, "learning_rate": 7.369240230784115e-05, "loss": 2.112, "step": 7395 }, { "epoch": 1.25, "grad_norm": 0.23046875, "learning_rate": 7.354982592188803e-05, "loss": 2.1423, "step": 7400 }, { "epoch": 1.25, "grad_norm": 0.22265625, "learning_rate": 7.340730732515696e-05, "loss": 2.1275, "step": 7405 }, { "epoch": 1.25, "grad_norm": 0.23046875, "learning_rate": 7.326484682902739e-05, "loss": 2.1446, "step": 7410 }, { "epoch": 1.26, "grad_norm": 0.240234375, "learning_rate": 7.312244474475178e-05, "loss": 2.1214, "step": 7415 }, { "epoch": 1.26, "grad_norm": 0.2275390625, "learning_rate": 7.298010138345485e-05, "loss": 2.131, "step": 7420 }, { "epoch": 1.26, "grad_norm": 0.2158203125, "learning_rate": 7.283781705613323e-05, "loss": 2.07, "step": 7425 }, { "epoch": 1.26, "grad_norm": 0.2265625, "learning_rate": 7.26955920736544e-05, "loss": 2.115, "step": 7430 }, { "epoch": 1.26, "grad_norm": 0.2333984375, "learning_rate": 7.255342674675625e-05, "loss": 2.0861, "step": 7435 }, { "epoch": 1.26, "grad_norm": 0.228515625, "learning_rate": 7.241132138604634e-05, "loss": 2.1605, "step": 7440 }, { "epoch": 1.26, "grad_norm": 0.232421875, "learning_rate": 7.226927630200117e-05, "loss": 2.1492, "step": 7445 }, { "epoch": 1.26, "grad_norm": 0.228515625, "learning_rate": 7.212729180496563e-05, "loss": 2.1075, "step": 7450 }, { "epoch": 1.26, "grad_norm": 0.232421875, "learning_rate": 7.198536820515214e-05, "loss": 2.1189, "step": 7455 }, { "epoch": 1.26, "grad_norm": 0.2275390625, "learning_rate": 7.18435058126401e-05, "loss": 2.0948, "step": 7460 }, { "epoch": 1.26, "grad_norm": 0.2197265625, "learning_rate": 7.170170493737522e-05, "loss": 2.1453, "step": 7465 }, { "epoch": 1.27, "grad_norm": 0.2255859375, "learning_rate": 7.155996588916883e-05, "loss": 2.1092, "step": 7470 }, { "epoch": 1.27, "grad_norm": 0.2265625, "learning_rate": 7.141828897769701e-05, "loss": 2.1437, "step": 7475 }, { "epoch": 1.27, "grad_norm": 0.2255859375, "learning_rate": 7.127667451250031e-05, "loss": 2.1338, "step": 7480 }, { "epoch": 1.27, "grad_norm": 0.23046875, "learning_rate": 7.113512280298264e-05, "loss": 2.1306, "step": 7485 }, { "epoch": 1.27, "grad_norm": 0.2236328125, "learning_rate": 7.099363415841097e-05, "loss": 2.1019, "step": 7490 }, { "epoch": 1.27, "grad_norm": 0.2255859375, "learning_rate": 7.085220888791439e-05, "loss": 2.0879, "step": 7495 }, { "epoch": 1.27, "grad_norm": 0.2314453125, "learning_rate": 7.071084730048352e-05, "loss": 2.1013, "step": 7500 }, { "epoch": 1.27, "grad_norm": 0.2314453125, "learning_rate": 7.056954970496988e-05, "loss": 2.1492, "step": 7505 }, { "epoch": 1.27, "grad_norm": 0.248046875, "learning_rate": 7.042831641008518e-05, "loss": 2.1336, "step": 7510 }, { "epoch": 1.27, "grad_norm": 0.22265625, "learning_rate": 7.028714772440061e-05, "loss": 2.1679, "step": 7515 }, { "epoch": 1.27, "grad_norm": 0.224609375, "learning_rate": 7.014604395634623e-05, "loss": 2.122, "step": 7520 }, { "epoch": 1.27, "grad_norm": 0.2255859375, "learning_rate": 7.000500541421028e-05, "loss": 2.1175, "step": 7525 }, { "epoch": 1.28, "grad_norm": 0.2216796875, "learning_rate": 6.986403240613844e-05, "loss": 2.1061, "step": 7530 }, { "epoch": 1.28, "grad_norm": 0.22265625, "learning_rate": 6.972312524013323e-05, "loss": 2.1216, "step": 7535 }, { "epoch": 1.28, "grad_norm": 0.2265625, "learning_rate": 6.958228422405335e-05, "loss": 2.1641, "step": 7540 }, { "epoch": 1.28, "grad_norm": 0.2265625, "learning_rate": 6.944150966561294e-05, "loss": 2.1464, "step": 7545 }, { "epoch": 1.28, "grad_norm": 0.2177734375, "learning_rate": 6.930080187238095e-05, "loss": 2.121, "step": 7550 }, { "epoch": 1.28, "grad_norm": 0.23046875, "learning_rate": 6.916016115178043e-05, "loss": 2.1218, "step": 7555 }, { "epoch": 1.28, "grad_norm": 0.236328125, "learning_rate": 6.901958781108794e-05, "loss": 2.1254, "step": 7560 }, { "epoch": 1.28, "grad_norm": 0.224609375, "learning_rate": 6.887908215743282e-05, "loss": 2.1073, "step": 7565 }, { "epoch": 1.28, "grad_norm": 0.23046875, "learning_rate": 6.873864449779646e-05, "loss": 2.1278, "step": 7570 }, { "epoch": 1.28, "grad_norm": 0.220703125, "learning_rate": 6.859827513901178e-05, "loss": 2.0982, "step": 7575 }, { "epoch": 1.28, "grad_norm": 0.2216796875, "learning_rate": 6.845797438776241e-05, "loss": 2.1182, "step": 7580 }, { "epoch": 1.28, "grad_norm": 0.2255859375, "learning_rate": 6.831774255058212e-05, "loss": 2.1411, "step": 7585 }, { "epoch": 1.29, "grad_norm": 0.2265625, "learning_rate": 6.81775799338541e-05, "loss": 2.149, "step": 7590 }, { "epoch": 1.29, "grad_norm": 0.2216796875, "learning_rate": 6.803748684381031e-05, "loss": 2.1647, "step": 7595 }, { "epoch": 1.29, "grad_norm": 0.2294921875, "learning_rate": 6.78974635865308e-05, "loss": 2.0988, "step": 7600 }, { "epoch": 1.29, "grad_norm": 0.224609375, "learning_rate": 6.775751046794308e-05, "loss": 2.0968, "step": 7605 }, { "epoch": 1.29, "grad_norm": 0.22265625, "learning_rate": 6.761762779382131e-05, "loss": 2.1752, "step": 7610 }, { "epoch": 1.29, "grad_norm": 0.23046875, "learning_rate": 6.747781586978589e-05, "loss": 2.1605, "step": 7615 }, { "epoch": 1.29, "grad_norm": 0.2294921875, "learning_rate": 6.73380750013026e-05, "loss": 2.1392, "step": 7620 }, { "epoch": 1.29, "grad_norm": 0.2275390625, "learning_rate": 6.719840549368183e-05, "loss": 2.1066, "step": 7625 }, { "epoch": 1.29, "grad_norm": 0.2373046875, "learning_rate": 6.705880765207825e-05, "loss": 2.0877, "step": 7630 }, { "epoch": 1.29, "grad_norm": 0.2265625, "learning_rate": 6.691928178148995e-05, "loss": 2.11, "step": 7635 }, { "epoch": 1.29, "grad_norm": 0.23828125, "learning_rate": 6.677982818675758e-05, "loss": 2.1526, "step": 7640 }, { "epoch": 1.29, "grad_norm": 0.23046875, "learning_rate": 6.664044717256402e-05, "loss": 2.0917, "step": 7645 }, { "epoch": 1.3, "grad_norm": 0.2255859375, "learning_rate": 6.650113904343366e-05, "loss": 2.1098, "step": 7650 }, { "epoch": 1.3, "grad_norm": 0.2275390625, "learning_rate": 6.636190410373143e-05, "loss": 2.128, "step": 7655 }, { "epoch": 1.3, "grad_norm": 0.2236328125, "learning_rate": 6.622274265766253e-05, "loss": 2.1164, "step": 7660 }, { "epoch": 1.3, "grad_norm": 0.2265625, "learning_rate": 6.608365500927148e-05, "loss": 2.0702, "step": 7665 }, { "epoch": 1.3, "grad_norm": 0.2373046875, "learning_rate": 6.594464146244165e-05, "loss": 2.1779, "step": 7670 }, { "epoch": 1.3, "grad_norm": 0.2294921875, "learning_rate": 6.580570232089449e-05, "loss": 2.0749, "step": 7675 }, { "epoch": 1.3, "grad_norm": 0.234375, "learning_rate": 6.56668378881888e-05, "loss": 2.1277, "step": 7680 }, { "epoch": 1.3, "grad_norm": 0.2197265625, "learning_rate": 6.552804846772026e-05, "loss": 2.1526, "step": 7685 }, { "epoch": 1.3, "grad_norm": 0.21875, "learning_rate": 6.538933436272065e-05, "loss": 2.1642, "step": 7690 }, { "epoch": 1.3, "grad_norm": 0.224609375, "learning_rate": 6.525069587625712e-05, "loss": 2.1258, "step": 7695 }, { "epoch": 1.3, "grad_norm": 0.2314453125, "learning_rate": 6.511213331123168e-05, "loss": 2.1485, "step": 7700 }, { "epoch": 1.3, "grad_norm": 0.2265625, "learning_rate": 6.497364697038047e-05, "loss": 2.11, "step": 7705 }, { "epoch": 1.31, "grad_norm": 0.2255859375, "learning_rate": 6.483523715627301e-05, "loss": 2.1677, "step": 7710 }, { "epoch": 1.31, "grad_norm": 0.2265625, "learning_rate": 6.469690417131171e-05, "loss": 2.1133, "step": 7715 }, { "epoch": 1.31, "grad_norm": 0.228515625, "learning_rate": 6.455864831773108e-05, "loss": 2.1201, "step": 7720 }, { "epoch": 1.31, "grad_norm": 0.2216796875, "learning_rate": 6.442046989759712e-05, "loss": 2.0895, "step": 7725 }, { "epoch": 1.31, "grad_norm": 0.22265625, "learning_rate": 6.428236921280666e-05, "loss": 2.0973, "step": 7730 }, { "epoch": 1.31, "grad_norm": 0.2314453125, "learning_rate": 6.414434656508665e-05, "loss": 2.1285, "step": 7735 }, { "epoch": 1.31, "grad_norm": 0.228515625, "learning_rate": 6.400640225599358e-05, "loss": 2.1108, "step": 7740 }, { "epoch": 1.31, "grad_norm": 0.2236328125, "learning_rate": 6.386853658691281e-05, "loss": 2.1164, "step": 7745 }, { "epoch": 1.31, "grad_norm": 0.23046875, "learning_rate": 6.373074985905781e-05, "loss": 2.1695, "step": 7750 }, { "epoch": 1.31, "grad_norm": 0.224609375, "learning_rate": 6.359304237346961e-05, "loss": 2.0999, "step": 7755 }, { "epoch": 1.31, "grad_norm": 0.2275390625, "learning_rate": 6.345541443101616e-05, "loss": 2.1377, "step": 7760 }, { "epoch": 1.32, "grad_norm": 0.228515625, "learning_rate": 6.331786633239154e-05, "loss": 2.113, "step": 7765 }, { "epoch": 1.32, "grad_norm": 0.2265625, "learning_rate": 6.318039837811542e-05, "loss": 2.1612, "step": 7770 }, { "epoch": 1.32, "grad_norm": 0.2314453125, "learning_rate": 6.304301086853243e-05, "loss": 2.1783, "step": 7775 }, { "epoch": 1.32, "grad_norm": 0.2333984375, "learning_rate": 6.290570410381129e-05, "loss": 2.1309, "step": 7780 }, { "epoch": 1.32, "grad_norm": 0.2353515625, "learning_rate": 6.276847838394446e-05, "loss": 2.0939, "step": 7785 }, { "epoch": 1.32, "grad_norm": 0.2353515625, "learning_rate": 6.263133400874725e-05, "loss": 2.1013, "step": 7790 }, { "epoch": 1.32, "grad_norm": 0.2216796875, "learning_rate": 6.249427127785724e-05, "loss": 2.1307, "step": 7795 }, { "epoch": 1.32, "grad_norm": 0.2294921875, "learning_rate": 6.235729049073371e-05, "loss": 2.0944, "step": 7800 }, { "epoch": 1.32, "grad_norm": 0.2255859375, "learning_rate": 6.222039194665678e-05, "loss": 2.0731, "step": 7805 }, { "epoch": 1.32, "grad_norm": 0.224609375, "learning_rate": 6.2083575944727e-05, "loss": 2.1293, "step": 7810 }, { "epoch": 1.32, "grad_norm": 0.22265625, "learning_rate": 6.194684278386455e-05, "loss": 2.1658, "step": 7815 }, { "epoch": 1.32, "grad_norm": 0.2236328125, "learning_rate": 6.18101927628085e-05, "loss": 2.0989, "step": 7820 }, { "epoch": 1.33, "grad_norm": 0.2255859375, "learning_rate": 6.167362618011648e-05, "loss": 2.1014, "step": 7825 }, { "epoch": 1.33, "grad_norm": 0.2275390625, "learning_rate": 6.153714333416372e-05, "loss": 2.1117, "step": 7830 }, { "epoch": 1.33, "grad_norm": 0.2294921875, "learning_rate": 6.140074452314236e-05, "loss": 2.0891, "step": 7835 }, { "epoch": 1.33, "grad_norm": 0.2265625, "learning_rate": 6.126443004506122e-05, "loss": 2.0833, "step": 7840 }, { "epoch": 1.33, "grad_norm": 0.228515625, "learning_rate": 6.112820019774461e-05, "loss": 2.1394, "step": 7845 }, { "epoch": 1.33, "grad_norm": 0.224609375, "learning_rate": 6.099205527883207e-05, "loss": 2.1298, "step": 7850 }, { "epoch": 1.33, "grad_norm": 0.224609375, "learning_rate": 6.0855995585777616e-05, "loss": 2.1394, "step": 7855 }, { "epoch": 1.33, "grad_norm": 0.2265625, "learning_rate": 6.072002141584891e-05, "loss": 2.1386, "step": 7860 }, { "epoch": 1.33, "grad_norm": 0.2265625, "learning_rate": 6.058413306612689e-05, "loss": 2.1127, "step": 7865 }, { "epoch": 1.33, "grad_norm": 0.2333984375, "learning_rate": 6.044833083350503e-05, "loss": 2.1322, "step": 7870 }, { "epoch": 1.33, "grad_norm": 0.2177734375, "learning_rate": 6.0312615014688436e-05, "loss": 2.1384, "step": 7875 }, { "epoch": 1.33, "grad_norm": 0.2236328125, "learning_rate": 6.017698590619362e-05, "loss": 2.1268, "step": 7880 }, { "epoch": 1.34, "grad_norm": 0.224609375, "learning_rate": 6.004144380434763e-05, "loss": 2.129, "step": 7885 }, { "epoch": 1.34, "grad_norm": 0.2255859375, "learning_rate": 5.9905989005287277e-05, "loss": 2.1109, "step": 7890 }, { "epoch": 1.34, "grad_norm": 0.228515625, "learning_rate": 5.977062180495876e-05, "loss": 2.1361, "step": 7895 }, { "epoch": 1.34, "grad_norm": 0.2216796875, "learning_rate": 5.96353424991169e-05, "loss": 2.1141, "step": 7900 }, { "epoch": 1.34, "grad_norm": 0.220703125, "learning_rate": 5.950015138332434e-05, "loss": 2.1336, "step": 7905 }, { "epoch": 1.34, "grad_norm": 0.2275390625, "learning_rate": 5.9365048752951225e-05, "loss": 2.1268, "step": 7910 }, { "epoch": 1.34, "grad_norm": 0.23046875, "learning_rate": 5.923003490317422e-05, "loss": 2.1146, "step": 7915 }, { "epoch": 1.34, "grad_norm": 0.228515625, "learning_rate": 5.9095110128976104e-05, "loss": 2.1023, "step": 7920 }, { "epoch": 1.34, "grad_norm": 0.2333984375, "learning_rate": 5.8960274725145056e-05, "loss": 2.1159, "step": 7925 }, { "epoch": 1.34, "grad_norm": 0.23046875, "learning_rate": 5.882552898627391e-05, "loss": 2.0938, "step": 7930 }, { "epoch": 1.34, "grad_norm": 0.2294921875, "learning_rate": 5.8690873206759675e-05, "loss": 2.0999, "step": 7935 }, { "epoch": 1.34, "grad_norm": 0.2177734375, "learning_rate": 5.8556307680802826e-05, "loss": 2.0965, "step": 7940 }, { "epoch": 1.35, "grad_norm": 0.224609375, "learning_rate": 5.842183270240652e-05, "loss": 2.0736, "step": 7945 }, { "epoch": 1.35, "grad_norm": 0.234375, "learning_rate": 5.8287448565376215e-05, "loss": 2.1204, "step": 7950 }, { "epoch": 1.35, "grad_norm": 0.2255859375, "learning_rate": 5.8153155563318904e-05, "loss": 2.1253, "step": 7955 }, { "epoch": 1.35, "grad_norm": 0.22265625, "learning_rate": 5.801895398964234e-05, "loss": 2.1087, "step": 7960 }, { "epoch": 1.35, "grad_norm": 0.2265625, "learning_rate": 5.788484413755469e-05, "loss": 2.0863, "step": 7965 }, { "epoch": 1.35, "grad_norm": 0.22265625, "learning_rate": 5.7750826300063496e-05, "loss": 2.1233, "step": 7970 }, { "epoch": 1.35, "grad_norm": 0.2294921875, "learning_rate": 5.761690076997543e-05, "loss": 2.1237, "step": 7975 }, { "epoch": 1.35, "grad_norm": 0.2255859375, "learning_rate": 5.7483067839895585e-05, "loss": 2.1592, "step": 7980 }, { "epoch": 1.35, "grad_norm": 0.22265625, "learning_rate": 5.7349327802226474e-05, "loss": 2.1362, "step": 7985 }, { "epoch": 1.35, "grad_norm": 0.2197265625, "learning_rate": 5.721568094916783e-05, "loss": 2.106, "step": 7990 }, { "epoch": 1.35, "grad_norm": 0.2255859375, "learning_rate": 5.7082127572715785e-05, "loss": 2.1259, "step": 7995 }, { "epoch": 1.35, "grad_norm": 0.234375, "learning_rate": 5.6948667964662136e-05, "loss": 2.1102, "step": 8000 }, { "epoch": 1.36, "grad_norm": 0.2265625, "learning_rate": 5.6815302416593894e-05, "loss": 2.1031, "step": 8005 }, { "epoch": 1.36, "grad_norm": 0.22265625, "learning_rate": 5.668203121989266e-05, "loss": 2.1164, "step": 8010 }, { "epoch": 1.36, "grad_norm": 0.23046875, "learning_rate": 5.6548854665733674e-05, "loss": 2.1152, "step": 8015 }, { "epoch": 1.36, "grad_norm": 0.224609375, "learning_rate": 5.641577304508559e-05, "loss": 2.1385, "step": 8020 }, { "epoch": 1.36, "grad_norm": 0.2294921875, "learning_rate": 5.6282786648709484e-05, "loss": 2.111, "step": 8025 }, { "epoch": 1.36, "grad_norm": 0.2314453125, "learning_rate": 5.614989576715852e-05, "loss": 2.1329, "step": 8030 }, { "epoch": 1.36, "grad_norm": 0.2216796875, "learning_rate": 5.601710069077712e-05, "loss": 2.1403, "step": 8035 }, { "epoch": 1.36, "grad_norm": 0.224609375, "learning_rate": 5.58844017097004e-05, "loss": 2.1114, "step": 8040 }, { "epoch": 1.36, "grad_norm": 0.228515625, "learning_rate": 5.575179911385349e-05, "loss": 2.1271, "step": 8045 }, { "epoch": 1.36, "grad_norm": 0.224609375, "learning_rate": 5.561929319295104e-05, "loss": 2.1481, "step": 8050 }, { "epoch": 1.36, "grad_norm": 0.2314453125, "learning_rate": 5.5486884236496303e-05, "loss": 2.1358, "step": 8055 }, { "epoch": 1.37, "grad_norm": 0.228515625, "learning_rate": 5.535457253378082e-05, "loss": 2.137, "step": 8060 }, { "epoch": 1.37, "grad_norm": 0.2333984375, "learning_rate": 5.522235837388362e-05, "loss": 2.1403, "step": 8065 }, { "epoch": 1.37, "grad_norm": 0.2314453125, "learning_rate": 5.5090242045670605e-05, "loss": 2.1279, "step": 8070 }, { "epoch": 1.37, "grad_norm": 0.2236328125, "learning_rate": 5.495822383779392e-05, "loss": 2.1185, "step": 8075 }, { "epoch": 1.37, "grad_norm": 0.232421875, "learning_rate": 5.48263040386914e-05, "loss": 2.1356, "step": 8080 }, { "epoch": 1.37, "grad_norm": 0.2255859375, "learning_rate": 5.469448293658574e-05, "loss": 2.1376, "step": 8085 }, { "epoch": 1.37, "grad_norm": 0.22265625, "learning_rate": 5.4562760819484125e-05, "loss": 2.1191, "step": 8090 }, { "epoch": 1.37, "grad_norm": 0.224609375, "learning_rate": 5.443113797517741e-05, "loss": 2.0956, "step": 8095 }, { "epoch": 1.37, "grad_norm": 0.2314453125, "learning_rate": 5.4299614691239576e-05, "loss": 2.1205, "step": 8100 }, { "epoch": 1.37, "grad_norm": 0.2353515625, "learning_rate": 5.416819125502712e-05, "loss": 2.1297, "step": 8105 }, { "epoch": 1.37, "grad_norm": 0.2255859375, "learning_rate": 5.4036867953678286e-05, "loss": 2.1068, "step": 8110 }, { "epoch": 1.37, "grad_norm": 0.2255859375, "learning_rate": 5.390564507411261e-05, "loss": 2.1027, "step": 8115 }, { "epoch": 1.38, "grad_norm": 0.232421875, "learning_rate": 5.377452290303023e-05, "loss": 2.1181, "step": 8120 }, { "epoch": 1.38, "grad_norm": 0.2255859375, "learning_rate": 5.364350172691124e-05, "loss": 2.1774, "step": 8125 }, { "epoch": 1.38, "grad_norm": 0.2294921875, "learning_rate": 5.3512581832015075e-05, "loss": 2.099, "step": 8130 }, { "epoch": 1.38, "grad_norm": 0.22265625, "learning_rate": 5.3381763504379914e-05, "loss": 2.1234, "step": 8135 }, { "epoch": 1.38, "grad_norm": 0.23046875, "learning_rate": 5.325104702982192e-05, "loss": 2.1567, "step": 8140 }, { "epoch": 1.38, "grad_norm": 0.2236328125, "learning_rate": 5.3120432693934894e-05, "loss": 2.149, "step": 8145 }, { "epoch": 1.38, "grad_norm": 0.2275390625, "learning_rate": 5.2989920782089265e-05, "loss": 2.1027, "step": 8150 }, { "epoch": 1.38, "grad_norm": 0.2275390625, "learning_rate": 5.2859511579431944e-05, "loss": 2.1403, "step": 8155 }, { "epoch": 1.38, "grad_norm": 0.232421875, "learning_rate": 5.272920537088528e-05, "loss": 2.1336, "step": 8160 }, { "epoch": 1.38, "grad_norm": 0.2265625, "learning_rate": 5.259900244114655e-05, "loss": 2.1591, "step": 8165 }, { "epoch": 1.38, "grad_norm": 0.2265625, "learning_rate": 5.2468903074687506e-05, "loss": 2.1639, "step": 8170 }, { "epoch": 1.38, "grad_norm": 0.2294921875, "learning_rate": 5.233890755575361e-05, "loss": 2.1787, "step": 8175 }, { "epoch": 1.39, "grad_norm": 0.2236328125, "learning_rate": 5.22090161683633e-05, "loss": 2.1079, "step": 8180 }, { "epoch": 1.39, "grad_norm": 0.232421875, "learning_rate": 5.207922919630771e-05, "loss": 2.1277, "step": 8185 }, { "epoch": 1.39, "grad_norm": 0.224609375, "learning_rate": 5.194954692314975e-05, "loss": 2.1226, "step": 8190 }, { "epoch": 1.39, "grad_norm": 0.2236328125, "learning_rate": 5.1819969632223505e-05, "loss": 2.1081, "step": 8195 }, { "epoch": 1.39, "grad_norm": 0.2236328125, "learning_rate": 5.1690497606633824e-05, "loss": 2.1174, "step": 8200 }, { "epoch": 1.39, "grad_norm": 0.2177734375, "learning_rate": 5.156113112925543e-05, "loss": 2.1002, "step": 8205 }, { "epoch": 1.39, "grad_norm": 0.2255859375, "learning_rate": 5.1431870482732516e-05, "loss": 2.1494, "step": 8210 }, { "epoch": 1.39, "grad_norm": 0.22265625, "learning_rate": 5.1302715949478174e-05, "loss": 2.1323, "step": 8215 }, { "epoch": 1.39, "grad_norm": 0.220703125, "learning_rate": 5.117366781167341e-05, "loss": 2.15, "step": 8220 }, { "epoch": 1.39, "grad_norm": 0.2314453125, "learning_rate": 5.104472635126695e-05, "loss": 2.1167, "step": 8225 }, { "epoch": 1.39, "grad_norm": 0.2216796875, "learning_rate": 5.091589184997441e-05, "loss": 2.1579, "step": 8230 }, { "epoch": 1.39, "grad_norm": 0.2265625, "learning_rate": 5.0787164589277645e-05, "loss": 2.1174, "step": 8235 }, { "epoch": 1.4, "grad_norm": 0.2265625, "learning_rate": 5.0658544850424274e-05, "loss": 2.1221, "step": 8240 }, { "epoch": 1.4, "grad_norm": 0.236328125, "learning_rate": 5.053003291442707e-05, "loss": 2.1003, "step": 8245 }, { "epoch": 1.4, "grad_norm": 0.23046875, "learning_rate": 5.0401629062063115e-05, "loss": 2.1398, "step": 8250 }, { "epoch": 1.4, "grad_norm": 0.2412109375, "learning_rate": 5.027333357387345e-05, "loss": 2.1235, "step": 8255 }, { "epoch": 1.4, "grad_norm": 0.224609375, "learning_rate": 5.014514673016237e-05, "loss": 2.1306, "step": 8260 }, { "epoch": 1.4, "grad_norm": 0.2236328125, "learning_rate": 5.00170688109967e-05, "loss": 2.1229, "step": 8265 }, { "epoch": 1.4, "grad_norm": 0.228515625, "learning_rate": 4.988910009620537e-05, "loss": 2.1448, "step": 8270 }, { "epoch": 1.4, "grad_norm": 0.224609375, "learning_rate": 4.976124086537871e-05, "loss": 2.1072, "step": 8275 }, { "epoch": 1.4, "grad_norm": 0.2265625, "learning_rate": 4.9633491397867815e-05, "loss": 2.0999, "step": 8280 }, { "epoch": 1.4, "grad_norm": 0.2275390625, "learning_rate": 4.950585197278404e-05, "loss": 2.1003, "step": 8285 }, { "epoch": 1.4, "grad_norm": 0.220703125, "learning_rate": 4.937832286899815e-05, "loss": 2.0978, "step": 8290 }, { "epoch": 1.4, "grad_norm": 0.2275390625, "learning_rate": 4.925090436514004e-05, "loss": 2.184, "step": 8295 }, { "epoch": 1.41, "grad_norm": 0.240234375, "learning_rate": 4.91235967395979e-05, "loss": 2.14, "step": 8300 }, { "epoch": 1.41, "grad_norm": 0.22265625, "learning_rate": 4.8996400270517675e-05, "loss": 2.1209, "step": 8305 }, { "epoch": 1.41, "grad_norm": 0.2255859375, "learning_rate": 4.886931523580246e-05, "loss": 2.1202, "step": 8310 }, { "epoch": 1.41, "grad_norm": 0.228515625, "learning_rate": 4.87423419131119e-05, "loss": 2.1826, "step": 8315 }, { "epoch": 1.41, "grad_norm": 0.2294921875, "learning_rate": 4.861548057986147e-05, "loss": 2.1141, "step": 8320 }, { "epoch": 1.41, "grad_norm": 0.228515625, "learning_rate": 4.848873151322209e-05, "loss": 2.1506, "step": 8325 }, { "epoch": 1.41, "grad_norm": 0.2275390625, "learning_rate": 4.836209499011932e-05, "loss": 2.1256, "step": 8330 }, { "epoch": 1.41, "grad_norm": 0.228515625, "learning_rate": 4.823557128723288e-05, "loss": 2.1182, "step": 8335 }, { "epoch": 1.41, "grad_norm": 0.2333984375, "learning_rate": 4.810916068099601e-05, "loss": 2.1319, "step": 8340 }, { "epoch": 1.41, "grad_norm": 0.2275390625, "learning_rate": 4.798286344759475e-05, "loss": 2.1291, "step": 8345 }, { "epoch": 1.41, "grad_norm": 0.2216796875, "learning_rate": 4.7856679862967515e-05, "loss": 2.0805, "step": 8350 }, { "epoch": 1.42, "grad_norm": 0.2255859375, "learning_rate": 4.773061020280443e-05, "loss": 2.1223, "step": 8355 }, { "epoch": 1.42, "grad_norm": 0.2255859375, "learning_rate": 4.760465474254667e-05, "loss": 2.1401, "step": 8360 }, { "epoch": 1.42, "grad_norm": 0.234375, "learning_rate": 4.7478813757385954e-05, "loss": 2.1489, "step": 8365 }, { "epoch": 1.42, "grad_norm": 0.2294921875, "learning_rate": 4.735308752226387e-05, "loss": 2.1411, "step": 8370 }, { "epoch": 1.42, "grad_norm": 0.23046875, "learning_rate": 4.722747631187123e-05, "loss": 2.1452, "step": 8375 }, { "epoch": 1.42, "grad_norm": 0.2255859375, "learning_rate": 4.710198040064767e-05, "loss": 2.1107, "step": 8380 }, { "epoch": 1.42, "grad_norm": 0.2275390625, "learning_rate": 4.697660006278073e-05, "loss": 2.1218, "step": 8385 }, { "epoch": 1.42, "grad_norm": 0.2265625, "learning_rate": 4.6851335572205646e-05, "loss": 2.1221, "step": 8390 }, { "epoch": 1.42, "grad_norm": 0.236328125, "learning_rate": 4.6726187202604465e-05, "loss": 2.148, "step": 8395 }, { "epoch": 1.42, "grad_norm": 0.2314453125, "learning_rate": 4.6601155227405436e-05, "loss": 2.1665, "step": 8400 }, { "epoch": 1.42, "grad_norm": 0.23828125, "learning_rate": 4.6476239919782636e-05, "loss": 2.1232, "step": 8405 }, { "epoch": 1.42, "grad_norm": 0.2265625, "learning_rate": 4.635144155265523e-05, "loss": 2.1338, "step": 8410 }, { "epoch": 1.43, "grad_norm": 0.220703125, "learning_rate": 4.622676039868672e-05, "loss": 2.1274, "step": 8415 }, { "epoch": 1.43, "grad_norm": 0.21875, "learning_rate": 4.6102196730284786e-05, "loss": 2.1651, "step": 8420 }, { "epoch": 1.43, "grad_norm": 0.2333984375, "learning_rate": 4.597775081960026e-05, "loss": 2.1164, "step": 8425 }, { "epoch": 1.43, "grad_norm": 0.220703125, "learning_rate": 4.585342293852666e-05, "loss": 2.1234, "step": 8430 }, { "epoch": 1.43, "grad_norm": 0.234375, "learning_rate": 4.572921335869974e-05, "loss": 2.1105, "step": 8435 }, { "epoch": 1.43, "grad_norm": 0.2265625, "learning_rate": 4.560512235149668e-05, "loss": 2.1434, "step": 8440 }, { "epoch": 1.43, "grad_norm": 0.228515625, "learning_rate": 4.5481150188035626e-05, "loss": 2.0948, "step": 8445 }, { "epoch": 1.43, "grad_norm": 0.23046875, "learning_rate": 4.535729713917522e-05, "loss": 2.1562, "step": 8450 }, { "epoch": 1.43, "grad_norm": 0.2255859375, "learning_rate": 4.5233563475513616e-05, "loss": 2.1353, "step": 8455 }, { "epoch": 1.43, "grad_norm": 0.2275390625, "learning_rate": 4.510994946738829e-05, "loss": 2.1399, "step": 8460 }, { "epoch": 1.43, "grad_norm": 0.232421875, "learning_rate": 4.498645538487528e-05, "loss": 2.1196, "step": 8465 }, { "epoch": 1.43, "grad_norm": 0.224609375, "learning_rate": 4.4863081497788506e-05, "loss": 2.0936, "step": 8470 }, { "epoch": 1.44, "grad_norm": 0.2333984375, "learning_rate": 4.473982807567937e-05, "loss": 2.1093, "step": 8475 }, { "epoch": 1.44, "grad_norm": 0.2255859375, "learning_rate": 4.4616695387836074e-05, "loss": 2.1156, "step": 8480 }, { "epoch": 1.44, "grad_norm": 0.2265625, "learning_rate": 4.449368370328302e-05, "loss": 2.106, "step": 8485 }, { "epoch": 1.44, "grad_norm": 0.2294921875, "learning_rate": 4.4370793290780224e-05, "loss": 2.1387, "step": 8490 }, { "epoch": 1.44, "grad_norm": 0.2236328125, "learning_rate": 4.42480244188228e-05, "loss": 2.1202, "step": 8495 }, { "epoch": 1.44, "grad_norm": 0.2353515625, "learning_rate": 4.412537735564019e-05, "loss": 2.1336, "step": 8500 }, { "epoch": 1.44, "grad_norm": 0.2294921875, "learning_rate": 4.4002852369195845e-05, "loss": 2.1211, "step": 8505 }, { "epoch": 1.44, "grad_norm": 0.2265625, "learning_rate": 4.3880449727186427e-05, "loss": 2.1334, "step": 8510 }, { "epoch": 1.44, "grad_norm": 0.234375, "learning_rate": 4.375816969704131e-05, "loss": 2.1229, "step": 8515 }, { "epoch": 1.44, "grad_norm": 0.22265625, "learning_rate": 4.363601254592201e-05, "loss": 2.1147, "step": 8520 }, { "epoch": 1.44, "grad_norm": 0.224609375, "learning_rate": 4.3513978540721477e-05, "loss": 2.1554, "step": 8525 }, { "epoch": 1.44, "grad_norm": 0.23046875, "learning_rate": 4.339206794806371e-05, "loss": 2.1565, "step": 8530 }, { "epoch": 1.45, "grad_norm": 0.2236328125, "learning_rate": 4.327028103430303e-05, "loss": 2.1332, "step": 8535 }, { "epoch": 1.45, "grad_norm": 0.2255859375, "learning_rate": 4.3148618065523546e-05, "loss": 2.1234, "step": 8540 }, { "epoch": 1.45, "grad_norm": 0.2255859375, "learning_rate": 4.3027079307538584e-05, "loss": 2.1442, "step": 8545 }, { "epoch": 1.45, "grad_norm": 0.248046875, "learning_rate": 4.290566502589011e-05, "loss": 2.1679, "step": 8550 }, { "epoch": 1.45, "grad_norm": 0.234375, "learning_rate": 4.2784375485848e-05, "loss": 2.113, "step": 8555 }, { "epoch": 1.45, "grad_norm": 0.2275390625, "learning_rate": 4.266321095240973e-05, "loss": 2.1225, "step": 8560 }, { "epoch": 1.45, "grad_norm": 0.2275390625, "learning_rate": 4.2542171690299605e-05, "loss": 2.0962, "step": 8565 }, { "epoch": 1.45, "grad_norm": 0.2265625, "learning_rate": 4.242125796396827e-05, "loss": 2.1323, "step": 8570 }, { "epoch": 1.45, "grad_norm": 0.228515625, "learning_rate": 4.230047003759206e-05, "loss": 2.1072, "step": 8575 }, { "epoch": 1.45, "grad_norm": 0.228515625, "learning_rate": 4.217980817507242e-05, "loss": 2.132, "step": 8580 }, { "epoch": 1.45, "grad_norm": 0.2314453125, "learning_rate": 4.205927264003544e-05, "loss": 2.1482, "step": 8585 }, { "epoch": 1.45, "grad_norm": 0.2275390625, "learning_rate": 4.193886369583117e-05, "loss": 2.1228, "step": 8590 }, { "epoch": 1.46, "grad_norm": 0.224609375, "learning_rate": 4.1818581605533094e-05, "loss": 2.1229, "step": 8595 }, { "epoch": 1.46, "grad_norm": 0.2197265625, "learning_rate": 4.1698426631937514e-05, "loss": 2.0852, "step": 8600 }, { "epoch": 1.46, "grad_norm": 0.2294921875, "learning_rate": 4.157839903756308e-05, "loss": 2.0963, "step": 8605 }, { "epoch": 1.46, "grad_norm": 0.2265625, "learning_rate": 4.145849908464999e-05, "loss": 2.1152, "step": 8610 }, { "epoch": 1.46, "grad_norm": 0.23046875, "learning_rate": 4.133872703515975e-05, "loss": 2.1029, "step": 8615 }, { "epoch": 1.46, "grad_norm": 0.22265625, "learning_rate": 4.121908315077421e-05, "loss": 2.1612, "step": 8620 }, { "epoch": 1.46, "grad_norm": 0.234375, "learning_rate": 4.1099567692895426e-05, "loss": 2.1364, "step": 8625 }, { "epoch": 1.46, "grad_norm": 0.2265625, "learning_rate": 4.098018092264474e-05, "loss": 2.0914, "step": 8630 }, { "epoch": 1.46, "grad_norm": 0.21875, "learning_rate": 4.08609231008623e-05, "loss": 2.1178, "step": 8635 }, { "epoch": 1.46, "grad_norm": 0.2353515625, "learning_rate": 4.0741794488106585e-05, "loss": 2.1975, "step": 8640 }, { "epoch": 1.46, "grad_norm": 0.2333984375, "learning_rate": 4.0622795344653816e-05, "loss": 2.1351, "step": 8645 }, { "epoch": 1.46, "grad_norm": 0.232421875, "learning_rate": 4.05039259304972e-05, "loss": 2.1472, "step": 8650 }, { "epoch": 1.47, "grad_norm": 0.2294921875, "learning_rate": 4.038518650534661e-05, "loss": 2.1258, "step": 8655 }, { "epoch": 1.47, "grad_norm": 0.2255859375, "learning_rate": 4.0266577328627996e-05, "loss": 2.0783, "step": 8660 }, { "epoch": 1.47, "grad_norm": 0.232421875, "learning_rate": 4.0148098659482537e-05, "loss": 2.1506, "step": 8665 }, { "epoch": 1.47, "grad_norm": 0.232421875, "learning_rate": 4.002975075676641e-05, "loss": 2.1108, "step": 8670 }, { "epoch": 1.47, "grad_norm": 0.2255859375, "learning_rate": 3.991153387905011e-05, "loss": 2.1207, "step": 8675 }, { "epoch": 1.47, "grad_norm": 0.2333984375, "learning_rate": 3.979344828461773e-05, "loss": 2.1169, "step": 8680 }, { "epoch": 1.47, "grad_norm": 0.236328125, "learning_rate": 3.967549423146665e-05, "loss": 2.1205, "step": 8685 }, { "epoch": 1.47, "grad_norm": 0.23046875, "learning_rate": 3.955767197730681e-05, "loss": 2.1345, "step": 8690 }, { "epoch": 1.47, "grad_norm": 0.2265625, "learning_rate": 3.943998177956022e-05, "loss": 2.1559, "step": 8695 }, { "epoch": 1.47, "grad_norm": 0.22265625, "learning_rate": 3.932242389536036e-05, "loss": 2.1094, "step": 8700 }, { "epoch": 1.47, "grad_norm": 0.2255859375, "learning_rate": 3.9204998581551554e-05, "loss": 2.1194, "step": 8705 }, { "epoch": 1.48, "grad_norm": 0.2265625, "learning_rate": 3.908770609468858e-05, "loss": 2.0894, "step": 8710 }, { "epoch": 1.48, "grad_norm": 0.236328125, "learning_rate": 3.897054669103597e-05, "loss": 2.1092, "step": 8715 }, { "epoch": 1.48, "grad_norm": 0.2294921875, "learning_rate": 3.885352062656749e-05, "loss": 2.1491, "step": 8720 }, { "epoch": 1.48, "grad_norm": 0.2255859375, "learning_rate": 3.8736628156965594e-05, "loss": 2.1457, "step": 8725 }, { "epoch": 1.48, "grad_norm": 0.2275390625, "learning_rate": 3.861986953762088e-05, "loss": 2.0965, "step": 8730 }, { "epoch": 1.48, "grad_norm": 0.2265625, "learning_rate": 3.850324502363141e-05, "loss": 2.1247, "step": 8735 }, { "epoch": 1.48, "grad_norm": 0.2236328125, "learning_rate": 3.838675486980232e-05, "loss": 2.1692, "step": 8740 }, { "epoch": 1.48, "grad_norm": 0.2236328125, "learning_rate": 3.8270399330645216e-05, "loss": 2.0999, "step": 8745 }, { "epoch": 1.48, "grad_norm": 0.228515625, "learning_rate": 3.815417866037753e-05, "loss": 2.1126, "step": 8750 }, { "epoch": 1.48, "grad_norm": 0.2236328125, "learning_rate": 3.80380931129221e-05, "loss": 2.1166, "step": 8755 }, { "epoch": 1.48, "grad_norm": 0.21875, "learning_rate": 3.792214294190643e-05, "loss": 2.0955, "step": 8760 }, { "epoch": 1.48, "grad_norm": 0.228515625, "learning_rate": 3.7806328400662374e-05, "loss": 2.1366, "step": 8765 }, { "epoch": 1.49, "grad_norm": 0.2392578125, "learning_rate": 3.769064974222537e-05, "loss": 2.1004, "step": 8770 }, { "epoch": 1.49, "grad_norm": 0.2421875, "learning_rate": 3.757510721933403e-05, "loss": 2.1277, "step": 8775 }, { "epoch": 1.49, "grad_norm": 0.228515625, "learning_rate": 3.74597010844295e-05, "loss": 2.1272, "step": 8780 }, { "epoch": 1.49, "grad_norm": 0.2294921875, "learning_rate": 3.734443158965499e-05, "loss": 2.1392, "step": 8785 }, { "epoch": 1.49, "grad_norm": 0.2275390625, "learning_rate": 3.722929898685507e-05, "loss": 2.1155, "step": 8790 }, { "epoch": 1.49, "grad_norm": 0.23046875, "learning_rate": 3.71143035275753e-05, "loss": 2.1582, "step": 8795 }, { "epoch": 1.49, "grad_norm": 0.2333984375, "learning_rate": 3.699944546306162e-05, "loss": 2.1508, "step": 8800 }, { "epoch": 1.49, "grad_norm": 0.2265625, "learning_rate": 3.6884725044259746e-05, "loss": 2.1642, "step": 8805 }, { "epoch": 1.49, "grad_norm": 0.2294921875, "learning_rate": 3.677014252181472e-05, "loss": 2.0776, "step": 8810 }, { "epoch": 1.49, "grad_norm": 0.228515625, "learning_rate": 3.665569814607017e-05, "loss": 2.1675, "step": 8815 }, { "epoch": 1.49, "grad_norm": 0.2255859375, "learning_rate": 3.6541392167068033e-05, "loss": 2.1034, "step": 8820 }, { "epoch": 1.49, "grad_norm": 0.2353515625, "learning_rate": 3.642722483454781e-05, "loss": 2.1548, "step": 8825 }, { "epoch": 1.5, "grad_norm": 0.2265625, "learning_rate": 3.6313196397946106e-05, "loss": 2.0931, "step": 8830 }, { "epoch": 1.5, "grad_norm": 0.2197265625, "learning_rate": 3.619930710639604e-05, "loss": 2.1602, "step": 8835 }, { "epoch": 1.5, "grad_norm": 0.23046875, "learning_rate": 3.608555720872678e-05, "loss": 2.0739, "step": 8840 }, { "epoch": 1.5, "grad_norm": 0.224609375, "learning_rate": 3.597194695346282e-05, "loss": 2.1437, "step": 8845 }, { "epoch": 1.5, "grad_norm": 0.2294921875, "learning_rate": 3.5858476588823664e-05, "loss": 2.1333, "step": 8850 }, { "epoch": 1.5, "grad_norm": 0.2470703125, "learning_rate": 3.574514636272318e-05, "loss": 2.1147, "step": 8855 }, { "epoch": 1.5, "grad_norm": 0.2255859375, "learning_rate": 3.563195652276893e-05, "loss": 2.1096, "step": 8860 }, { "epoch": 1.5, "grad_norm": 0.2314453125, "learning_rate": 3.551890731626197e-05, "loss": 2.1184, "step": 8865 }, { "epoch": 1.5, "grad_norm": 0.224609375, "learning_rate": 3.54059989901959e-05, "loss": 2.1549, "step": 8870 }, { "epoch": 1.5, "grad_norm": 0.228515625, "learning_rate": 3.529323179125661e-05, "loss": 2.1688, "step": 8875 }, { "epoch": 1.5, "grad_norm": 0.2294921875, "learning_rate": 3.518060596582167e-05, "loss": 2.1652, "step": 8880 }, { "epoch": 1.5, "grad_norm": 0.2255859375, "learning_rate": 3.506812175995967e-05, "loss": 2.1266, "step": 8885 }, { "epoch": 1.51, "grad_norm": 0.22265625, "learning_rate": 3.4955779419429856e-05, "loss": 2.1351, "step": 8890 }, { "epoch": 1.51, "grad_norm": 0.22265625, "learning_rate": 3.484357918968163e-05, "loss": 2.1242, "step": 8895 }, { "epoch": 1.51, "grad_norm": 0.224609375, "learning_rate": 3.4731521315853675e-05, "loss": 2.1029, "step": 8900 }, { "epoch": 1.51, "grad_norm": 0.2353515625, "learning_rate": 3.461960604277381e-05, "loss": 2.129, "step": 8905 }, { "epoch": 1.51, "grad_norm": 0.2236328125, "learning_rate": 3.45078336149583e-05, "loss": 2.1262, "step": 8910 }, { "epoch": 1.51, "grad_norm": 0.228515625, "learning_rate": 3.439620427661119e-05, "loss": 2.14, "step": 8915 }, { "epoch": 1.51, "grad_norm": 0.224609375, "learning_rate": 3.4284718271624015e-05, "loss": 2.1609, "step": 8920 }, { "epoch": 1.51, "grad_norm": 0.2314453125, "learning_rate": 3.417337584357512e-05, "loss": 2.0996, "step": 8925 }, { "epoch": 1.51, "grad_norm": 0.22265625, "learning_rate": 3.4062177235729145e-05, "loss": 2.0893, "step": 8930 }, { "epoch": 1.51, "grad_norm": 0.2255859375, "learning_rate": 3.3951122691036564e-05, "loss": 2.1178, "step": 8935 }, { "epoch": 1.51, "grad_norm": 0.2216796875, "learning_rate": 3.384021245213297e-05, "loss": 2.1169, "step": 8940 }, { "epoch": 1.51, "grad_norm": 0.22265625, "learning_rate": 3.372944676133878e-05, "loss": 2.1666, "step": 8945 }, { "epoch": 1.52, "grad_norm": 0.2294921875, "learning_rate": 3.3618825860658576e-05, "loss": 2.1317, "step": 8950 }, { "epoch": 1.52, "grad_norm": 0.228515625, "learning_rate": 3.35083499917806e-05, "loss": 2.1147, "step": 8955 }, { "epoch": 1.52, "grad_norm": 0.224609375, "learning_rate": 3.3398019396076184e-05, "loss": 2.1252, "step": 8960 }, { "epoch": 1.52, "grad_norm": 0.224609375, "learning_rate": 3.328783431459936e-05, "loss": 2.0857, "step": 8965 }, { "epoch": 1.52, "grad_norm": 0.228515625, "learning_rate": 3.3177794988086074e-05, "loss": 2.1214, "step": 8970 }, { "epoch": 1.52, "grad_norm": 0.234375, "learning_rate": 3.306790165695396e-05, "loss": 2.0711, "step": 8975 }, { "epoch": 1.52, "grad_norm": 0.232421875, "learning_rate": 3.295815456130162e-05, "loss": 2.1091, "step": 8980 }, { "epoch": 1.52, "grad_norm": 0.232421875, "learning_rate": 3.2848553940908186e-05, "loss": 2.134, "step": 8985 }, { "epoch": 1.52, "grad_norm": 0.224609375, "learning_rate": 3.2739100035232776e-05, "loss": 2.103, "step": 8990 }, { "epoch": 1.52, "grad_norm": 0.2353515625, "learning_rate": 3.262979308341385e-05, "loss": 2.1696, "step": 8995 }, { "epoch": 1.52, "grad_norm": 0.2275390625, "learning_rate": 3.2520633324268924e-05, "loss": 2.1352, "step": 9000 }, { "epoch": 1.53, "grad_norm": 0.224609375, "learning_rate": 3.2411620996293876e-05, "loss": 2.1056, "step": 9005 }, { "epoch": 1.53, "grad_norm": 0.2294921875, "learning_rate": 3.230275633766248e-05, "loss": 2.1891, "step": 9010 }, { "epoch": 1.53, "grad_norm": 0.2275390625, "learning_rate": 3.219403958622587e-05, "loss": 2.1408, "step": 9015 }, { "epoch": 1.53, "grad_norm": 0.228515625, "learning_rate": 3.208547097951206e-05, "loss": 2.1556, "step": 9020 }, { "epoch": 1.53, "grad_norm": 0.2265625, "learning_rate": 3.197705075472529e-05, "loss": 2.1061, "step": 9025 }, { "epoch": 1.53, "grad_norm": 0.2294921875, "learning_rate": 3.186877914874572e-05, "loss": 2.1357, "step": 9030 }, { "epoch": 1.53, "grad_norm": 0.2255859375, "learning_rate": 3.1760656398128764e-05, "loss": 2.1443, "step": 9035 }, { "epoch": 1.53, "grad_norm": 0.2275390625, "learning_rate": 3.165268273910461e-05, "loss": 2.0885, "step": 9040 }, { "epoch": 1.53, "grad_norm": 0.2255859375, "learning_rate": 3.154485840757775e-05, "loss": 2.0986, "step": 9045 }, { "epoch": 1.53, "grad_norm": 0.2294921875, "learning_rate": 3.14371836391263e-05, "loss": 2.1272, "step": 9050 }, { "epoch": 1.53, "grad_norm": 0.228515625, "learning_rate": 3.1329658669001724e-05, "loss": 2.1175, "step": 9055 }, { "epoch": 1.53, "grad_norm": 0.2314453125, "learning_rate": 3.1222283732128186e-05, "loss": 2.1358, "step": 9060 }, { "epoch": 1.54, "grad_norm": 0.228515625, "learning_rate": 3.111505906310194e-05, "loss": 2.1075, "step": 9065 }, { "epoch": 1.54, "grad_norm": 0.228515625, "learning_rate": 3.100798489619111e-05, "loss": 2.1126, "step": 9070 }, { "epoch": 1.54, "grad_norm": 0.2236328125, "learning_rate": 3.0901061465334905e-05, "loss": 2.1089, "step": 9075 }, { "epoch": 1.54, "grad_norm": 0.2294921875, "learning_rate": 3.079428900414314e-05, "loss": 2.1171, "step": 9080 }, { "epoch": 1.54, "grad_norm": 0.224609375, "learning_rate": 3.0687667745895876e-05, "loss": 2.1315, "step": 9085 }, { "epoch": 1.54, "grad_norm": 0.2294921875, "learning_rate": 3.058119792354283e-05, "loss": 2.1353, "step": 9090 }, { "epoch": 1.54, "grad_norm": 0.2314453125, "learning_rate": 3.0474879769702703e-05, "loss": 2.1024, "step": 9095 }, { "epoch": 1.54, "grad_norm": 0.23046875, "learning_rate": 3.0368713516663093e-05, "loss": 2.1436, "step": 9100 }, { "epoch": 1.54, "grad_norm": 0.23046875, "learning_rate": 3.0262699396379467e-05, "loss": 2.157, "step": 9105 }, { "epoch": 1.54, "grad_norm": 0.2275390625, "learning_rate": 3.0156837640475046e-05, "loss": 2.1166, "step": 9110 }, { "epoch": 1.54, "grad_norm": 0.2275390625, "learning_rate": 3.0051128480240143e-05, "loss": 2.1443, "step": 9115 }, { "epoch": 1.54, "grad_norm": 0.2236328125, "learning_rate": 2.9945572146631605e-05, "loss": 2.1404, "step": 9120 }, { "epoch": 1.55, "grad_norm": 0.2255859375, "learning_rate": 2.9840168870272413e-05, "loss": 2.0834, "step": 9125 }, { "epoch": 1.55, "grad_norm": 0.2265625, "learning_rate": 2.973491888145127e-05, "loss": 2.1451, "step": 9130 }, { "epoch": 1.55, "grad_norm": 0.2294921875, "learning_rate": 2.9629822410121754e-05, "loss": 2.1062, "step": 9135 }, { "epoch": 1.55, "grad_norm": 0.22265625, "learning_rate": 2.9524879685902173e-05, "loss": 2.112, "step": 9140 }, { "epoch": 1.55, "grad_norm": 0.2255859375, "learning_rate": 2.9420090938074917e-05, "loss": 2.1231, "step": 9145 }, { "epoch": 1.55, "grad_norm": 0.2333984375, "learning_rate": 2.9315456395585884e-05, "loss": 2.1256, "step": 9150 }, { "epoch": 1.55, "grad_norm": 0.2275390625, "learning_rate": 2.9210976287044144e-05, "loss": 2.1237, "step": 9155 }, { "epoch": 1.55, "grad_norm": 0.2353515625, "learning_rate": 2.9106650840721305e-05, "loss": 2.1511, "step": 9160 }, { "epoch": 1.55, "grad_norm": 0.21875, "learning_rate": 2.9002480284551094e-05, "loss": 2.1458, "step": 9165 }, { "epoch": 1.55, "grad_norm": 0.2255859375, "learning_rate": 2.8898464846128837e-05, "loss": 2.1324, "step": 9170 }, { "epoch": 1.55, "grad_norm": 0.232421875, "learning_rate": 2.8794604752710873e-05, "loss": 2.1192, "step": 9175 }, { "epoch": 1.55, "grad_norm": 0.23046875, "learning_rate": 2.8690900231214224e-05, "loss": 2.1224, "step": 9180 }, { "epoch": 1.56, "grad_norm": 0.2275390625, "learning_rate": 2.8587351508215997e-05, "loss": 2.1159, "step": 9185 }, { "epoch": 1.56, "grad_norm": 0.2314453125, "learning_rate": 2.8483958809952883e-05, "loss": 2.1377, "step": 9190 }, { "epoch": 1.56, "grad_norm": 0.232421875, "learning_rate": 2.838072236232069e-05, "loss": 2.1342, "step": 9195 }, { "epoch": 1.56, "grad_norm": 0.220703125, "learning_rate": 2.8277642390873904e-05, "loss": 2.1474, "step": 9200 }, { "epoch": 1.56, "grad_norm": 0.2236328125, "learning_rate": 2.8174719120825e-05, "loss": 2.0472, "step": 9205 }, { "epoch": 1.56, "grad_norm": 0.228515625, "learning_rate": 2.8071952777044208e-05, "loss": 2.1405, "step": 9210 }, { "epoch": 1.56, "grad_norm": 0.240234375, "learning_rate": 2.796934358405887e-05, "loss": 2.135, "step": 9215 }, { "epoch": 1.56, "grad_norm": 0.23046875, "learning_rate": 2.786689176605295e-05, "loss": 2.178, "step": 9220 }, { "epoch": 1.56, "grad_norm": 0.2265625, "learning_rate": 2.7764597546866656e-05, "loss": 2.1374, "step": 9225 }, { "epoch": 1.56, "grad_norm": 0.2275390625, "learning_rate": 2.7662461149995723e-05, "loss": 2.1224, "step": 9230 }, { "epoch": 1.56, "grad_norm": 0.2265625, "learning_rate": 2.7560482798591193e-05, "loss": 2.0993, "step": 9235 }, { "epoch": 1.56, "grad_norm": 0.23046875, "learning_rate": 2.745866271545876e-05, "loss": 2.1677, "step": 9240 }, { "epoch": 1.57, "grad_norm": 0.2314453125, "learning_rate": 2.7357001123058358e-05, "loss": 2.1336, "step": 9245 }, { "epoch": 1.57, "grad_norm": 0.2314453125, "learning_rate": 2.7255498243503607e-05, "loss": 2.1442, "step": 9250 }, { "epoch": 1.57, "grad_norm": 0.2275390625, "learning_rate": 2.7154154298561407e-05, "loss": 2.0766, "step": 9255 }, { "epoch": 1.57, "grad_norm": 0.2275390625, "learning_rate": 2.705296950965135e-05, "loss": 2.1467, "step": 9260 }, { "epoch": 1.57, "grad_norm": 0.2333984375, "learning_rate": 2.695194409784534e-05, "loss": 2.1041, "step": 9265 }, { "epoch": 1.57, "grad_norm": 0.21875, "learning_rate": 2.685107828386708e-05, "loss": 2.0962, "step": 9270 }, { "epoch": 1.57, "grad_norm": 0.2236328125, "learning_rate": 2.6750372288091563e-05, "loss": 2.0952, "step": 9275 }, { "epoch": 1.57, "grad_norm": 0.234375, "learning_rate": 2.6649826330544624e-05, "loss": 2.1158, "step": 9280 }, { "epoch": 1.57, "grad_norm": 0.244140625, "learning_rate": 2.6549440630902377e-05, "loss": 2.0895, "step": 9285 }, { "epoch": 1.57, "grad_norm": 0.2265625, "learning_rate": 2.644921540849087e-05, "loss": 2.1119, "step": 9290 }, { "epoch": 1.57, "grad_norm": 0.2236328125, "learning_rate": 2.6349150882285535e-05, "loss": 2.1148, "step": 9295 }, { "epoch": 1.58, "grad_norm": 0.2294921875, "learning_rate": 2.6249247270910594e-05, "loss": 2.0864, "step": 9300 }, { "epoch": 1.58, "grad_norm": 0.2236328125, "learning_rate": 2.614950479263889e-05, "loss": 2.1098, "step": 9305 }, { "epoch": 1.58, "grad_norm": 0.228515625, "learning_rate": 2.6049923665391108e-05, "loss": 2.1359, "step": 9310 }, { "epoch": 1.58, "grad_norm": 0.2294921875, "learning_rate": 2.5950504106735353e-05, "loss": 2.1003, "step": 9315 }, { "epoch": 1.58, "grad_norm": 0.228515625, "learning_rate": 2.5851246333886815e-05, "loss": 2.1277, "step": 9320 }, { "epoch": 1.58, "grad_norm": 0.220703125, "learning_rate": 2.5752150563707234e-05, "loss": 2.0998, "step": 9325 }, { "epoch": 1.58, "grad_norm": 0.23046875, "learning_rate": 2.5653217012704244e-05, "loss": 2.1263, "step": 9330 }, { "epoch": 1.58, "grad_norm": 0.234375, "learning_rate": 2.5554445897031286e-05, "loss": 2.0996, "step": 9335 }, { "epoch": 1.58, "grad_norm": 0.23046875, "learning_rate": 2.5455837432486707e-05, "loss": 2.0911, "step": 9340 }, { "epoch": 1.58, "grad_norm": 0.228515625, "learning_rate": 2.5357391834513588e-05, "loss": 2.1413, "step": 9345 }, { "epoch": 1.58, "grad_norm": 0.2275390625, "learning_rate": 2.5259109318199194e-05, "loss": 2.1594, "step": 9350 }, { "epoch": 1.58, "grad_norm": 0.2265625, "learning_rate": 2.5160990098274373e-05, "loss": 2.0828, "step": 9355 }, { "epoch": 1.59, "grad_norm": 0.2275390625, "learning_rate": 2.5063034389113282e-05, "loss": 2.1489, "step": 9360 }, { "epoch": 1.59, "grad_norm": 0.236328125, "learning_rate": 2.4965242404732892e-05, "loss": 2.1443, "step": 9365 }, { "epoch": 1.59, "grad_norm": 0.228515625, "learning_rate": 2.48676143587923e-05, "loss": 2.1547, "step": 9370 }, { "epoch": 1.59, "grad_norm": 0.2255859375, "learning_rate": 2.4770150464592566e-05, "loss": 2.0968, "step": 9375 }, { "epoch": 1.59, "grad_norm": 0.2255859375, "learning_rate": 2.4672850935076065e-05, "loss": 2.0872, "step": 9380 }, { "epoch": 1.59, "grad_norm": 0.228515625, "learning_rate": 2.4575715982825997e-05, "loss": 2.1518, "step": 9385 }, { "epoch": 1.59, "grad_norm": 0.2216796875, "learning_rate": 2.4478745820066084e-05, "loss": 2.1032, "step": 9390 }, { "epoch": 1.59, "grad_norm": 0.23828125, "learning_rate": 2.4381940658659963e-05, "loss": 2.102, "step": 9395 }, { "epoch": 1.59, "grad_norm": 0.2333984375, "learning_rate": 2.4285300710110782e-05, "loss": 2.1821, "step": 9400 }, { "epoch": 1.59, "grad_norm": 0.224609375, "learning_rate": 2.4188826185560743e-05, "loss": 2.0965, "step": 9405 }, { "epoch": 1.59, "grad_norm": 0.2255859375, "learning_rate": 2.409251729579055e-05, "loss": 2.1287, "step": 9410 }, { "epoch": 1.59, "grad_norm": 0.2314453125, "learning_rate": 2.399637425121911e-05, "loss": 2.1487, "step": 9415 }, { "epoch": 1.6, "grad_norm": 0.2177734375, "learning_rate": 2.390039726190295e-05, "loss": 2.1267, "step": 9420 }, { "epoch": 1.6, "grad_norm": 0.2294921875, "learning_rate": 2.380458653753579e-05, "loss": 2.1301, "step": 9425 }, { "epoch": 1.6, "grad_norm": 0.2314453125, "learning_rate": 2.370894228744809e-05, "loss": 2.1212, "step": 9430 }, { "epoch": 1.6, "grad_norm": 0.216796875, "learning_rate": 2.3613464720606637e-05, "loss": 2.0878, "step": 9435 }, { "epoch": 1.6, "grad_norm": 0.2265625, "learning_rate": 2.351815404561394e-05, "loss": 2.1501, "step": 9440 }, { "epoch": 1.6, "grad_norm": 0.224609375, "learning_rate": 2.3423010470707972e-05, "loss": 2.1325, "step": 9445 }, { "epoch": 1.6, "grad_norm": 0.2294921875, "learning_rate": 2.3328034203761582e-05, "loss": 2.1175, "step": 9450 }, { "epoch": 1.6, "grad_norm": 0.228515625, "learning_rate": 2.323322545228208e-05, "loss": 2.1565, "step": 9455 }, { "epoch": 1.6, "grad_norm": 0.23046875, "learning_rate": 2.3138584423410823e-05, "loss": 2.126, "step": 9460 }, { "epoch": 1.6, "grad_norm": 0.2275390625, "learning_rate": 2.3044111323922623e-05, "loss": 2.1131, "step": 9465 }, { "epoch": 1.6, "grad_norm": 0.23046875, "learning_rate": 2.2949806360225502e-05, "loss": 2.1226, "step": 9470 }, { "epoch": 1.6, "grad_norm": 0.2333984375, "learning_rate": 2.2855669738360064e-05, "loss": 2.1327, "step": 9475 }, { "epoch": 1.61, "grad_norm": 0.224609375, "learning_rate": 2.2761701663999158e-05, "loss": 2.1363, "step": 9480 }, { "epoch": 1.61, "grad_norm": 0.2275390625, "learning_rate": 2.2667902342447356e-05, "loss": 2.0965, "step": 9485 }, { "epoch": 1.61, "grad_norm": 0.2373046875, "learning_rate": 2.2574271978640572e-05, "loss": 2.1373, "step": 9490 }, { "epoch": 1.61, "grad_norm": 0.2236328125, "learning_rate": 2.248081077714549e-05, "loss": 2.1131, "step": 9495 }, { "epoch": 1.61, "grad_norm": 0.224609375, "learning_rate": 2.2387518942159292e-05, "loss": 2.1056, "step": 9500 }, { "epoch": 1.61, "grad_norm": 0.2294921875, "learning_rate": 2.2294396677509078e-05, "loss": 2.1192, "step": 9505 }, { "epoch": 1.61, "grad_norm": 0.2294921875, "learning_rate": 2.2201444186651487e-05, "loss": 2.1341, "step": 9510 }, { "epoch": 1.61, "grad_norm": 0.224609375, "learning_rate": 2.210866167267225e-05, "loss": 2.0922, "step": 9515 }, { "epoch": 1.61, "grad_norm": 0.22265625, "learning_rate": 2.2016049338285628e-05, "loss": 2.1433, "step": 9520 }, { "epoch": 1.61, "grad_norm": 0.2275390625, "learning_rate": 2.1923607385834167e-05, "loss": 2.1042, "step": 9525 }, { "epoch": 1.61, "grad_norm": 0.2265625, "learning_rate": 2.1831336017288174e-05, "loss": 2.0894, "step": 9530 }, { "epoch": 1.61, "grad_norm": 0.228515625, "learning_rate": 2.1739235434245097e-05, "loss": 2.1704, "step": 9535 }, { "epoch": 1.62, "grad_norm": 0.234375, "learning_rate": 2.1647305837929466e-05, "loss": 2.0889, "step": 9540 }, { "epoch": 1.62, "grad_norm": 0.2333984375, "learning_rate": 2.1555547429192112e-05, "loss": 2.0969, "step": 9545 }, { "epoch": 1.62, "grad_norm": 0.2265625, "learning_rate": 2.1463960408509832e-05, "loss": 2.136, "step": 9550 }, { "epoch": 1.62, "grad_norm": 0.2236328125, "learning_rate": 2.137254497598501e-05, "loss": 2.1246, "step": 9555 }, { "epoch": 1.62, "grad_norm": 0.228515625, "learning_rate": 2.128130133134516e-05, "loss": 2.1073, "step": 9560 }, { "epoch": 1.62, "grad_norm": 0.224609375, "learning_rate": 2.1190229673942363e-05, "loss": 2.142, "step": 9565 }, { "epoch": 1.62, "grad_norm": 0.236328125, "learning_rate": 2.109933020275312e-05, "loss": 2.1124, "step": 9570 }, { "epoch": 1.62, "grad_norm": 0.2236328125, "learning_rate": 2.1008603116377545e-05, "loss": 2.1026, "step": 9575 }, { "epoch": 1.62, "grad_norm": 0.2265625, "learning_rate": 2.091804861303922e-05, "loss": 2.1151, "step": 9580 }, { "epoch": 1.62, "grad_norm": 0.2294921875, "learning_rate": 2.0827666890584685e-05, "loss": 2.0735, "step": 9585 }, { "epoch": 1.62, "grad_norm": 0.2265625, "learning_rate": 2.073745814648287e-05, "loss": 2.119, "step": 9590 }, { "epoch": 1.63, "grad_norm": 0.228515625, "learning_rate": 2.0647422577824882e-05, "loss": 2.1127, "step": 9595 }, { "epoch": 1.63, "grad_norm": 0.224609375, "learning_rate": 2.0557560381323437e-05, "loss": 2.1275, "step": 9600 }, { "epoch": 1.63, "grad_norm": 0.236328125, "learning_rate": 2.046787175331244e-05, "loss": 2.1583, "step": 9605 }, { "epoch": 1.63, "grad_norm": 0.2236328125, "learning_rate": 2.037835688974662e-05, "loss": 2.1137, "step": 9610 }, { "epoch": 1.63, "grad_norm": 0.2265625, "learning_rate": 2.0289015986201043e-05, "loss": 2.086, "step": 9615 }, { "epoch": 1.63, "grad_norm": 0.23046875, "learning_rate": 2.019984923787065e-05, "loss": 2.1226, "step": 9620 }, { "epoch": 1.63, "grad_norm": 0.228515625, "learning_rate": 2.0110856839569947e-05, "loss": 2.1492, "step": 9625 }, { "epoch": 1.63, "grad_norm": 0.232421875, "learning_rate": 2.0022038985732495e-05, "loss": 2.1303, "step": 9630 }, { "epoch": 1.63, "grad_norm": 0.228515625, "learning_rate": 1.99333958704105e-05, "loss": 2.1495, "step": 9635 }, { "epoch": 1.63, "grad_norm": 0.2333984375, "learning_rate": 1.984492768727443e-05, "loss": 2.1262, "step": 9640 }, { "epoch": 1.63, "grad_norm": 0.2294921875, "learning_rate": 1.9756634629612447e-05, "loss": 2.1363, "step": 9645 }, { "epoch": 1.63, "grad_norm": 0.234375, "learning_rate": 1.9668516890330212e-05, "loss": 2.1487, "step": 9650 }, { "epoch": 1.64, "grad_norm": 0.224609375, "learning_rate": 1.95805746619503e-05, "loss": 2.1058, "step": 9655 }, { "epoch": 1.64, "grad_norm": 0.2314453125, "learning_rate": 1.9492808136611818e-05, "loss": 2.1014, "step": 9660 }, { "epoch": 1.64, "grad_norm": 0.2294921875, "learning_rate": 1.9405217506069994e-05, "loss": 2.1296, "step": 9665 }, { "epoch": 1.64, "grad_norm": 0.2265625, "learning_rate": 1.9317802961695786e-05, "loss": 2.1045, "step": 9670 }, { "epoch": 1.64, "grad_norm": 0.224609375, "learning_rate": 1.923056469447535e-05, "loss": 2.1638, "step": 9675 }, { "epoch": 1.64, "grad_norm": 0.2216796875, "learning_rate": 1.914350289500979e-05, "loss": 2.1128, "step": 9680 }, { "epoch": 1.64, "grad_norm": 0.2255859375, "learning_rate": 1.9056617753514628e-05, "loss": 2.1096, "step": 9685 }, { "epoch": 1.64, "grad_norm": 0.232421875, "learning_rate": 1.8969909459819412e-05, "loss": 2.1324, "step": 9690 }, { "epoch": 1.64, "grad_norm": 0.228515625, "learning_rate": 1.888337820336735e-05, "loss": 2.1221, "step": 9695 }, { "epoch": 1.64, "grad_norm": 0.2236328125, "learning_rate": 1.879702417321475e-05, "loss": 2.112, "step": 9700 }, { "epoch": 1.64, "grad_norm": 0.23046875, "learning_rate": 1.871084755803082e-05, "loss": 2.137, "step": 9705 }, { "epoch": 1.64, "grad_norm": 0.2255859375, "learning_rate": 1.8624848546097086e-05, "loss": 2.1575, "step": 9710 }, { "epoch": 1.65, "grad_norm": 0.2294921875, "learning_rate": 1.8539027325307056e-05, "loss": 2.1784, "step": 9715 }, { "epoch": 1.65, "grad_norm": 0.23046875, "learning_rate": 1.8453384083165803e-05, "loss": 2.0949, "step": 9720 }, { "epoch": 1.65, "grad_norm": 0.224609375, "learning_rate": 1.8367919006789558e-05, "loss": 2.1114, "step": 9725 }, { "epoch": 1.65, "grad_norm": 0.2236328125, "learning_rate": 1.828263228290522e-05, "loss": 2.1596, "step": 9730 }, { "epoch": 1.65, "grad_norm": 0.2265625, "learning_rate": 1.8197524097850095e-05, "loss": 2.1079, "step": 9735 }, { "epoch": 1.65, "grad_norm": 0.2275390625, "learning_rate": 1.8112594637571366e-05, "loss": 2.0991, "step": 9740 }, { "epoch": 1.65, "grad_norm": 0.2353515625, "learning_rate": 1.802784408762578e-05, "loss": 2.1254, "step": 9745 }, { "epoch": 1.65, "grad_norm": 0.224609375, "learning_rate": 1.7943272633179166e-05, "loss": 2.0966, "step": 9750 }, { "epoch": 1.65, "grad_norm": 0.234375, "learning_rate": 1.7858880459006e-05, "loss": 2.1437, "step": 9755 }, { "epoch": 1.65, "grad_norm": 0.2275390625, "learning_rate": 1.777466774948916e-05, "loss": 2.1718, "step": 9760 }, { "epoch": 1.65, "grad_norm": 0.22265625, "learning_rate": 1.769063468861941e-05, "loss": 2.1158, "step": 9765 }, { "epoch": 1.65, "grad_norm": 0.21875, "learning_rate": 1.7606781459994913e-05, "loss": 2.0889, "step": 9770 }, { "epoch": 1.66, "grad_norm": 0.2421875, "learning_rate": 1.7523108246821017e-05, "loss": 2.1166, "step": 9775 }, { "epoch": 1.66, "grad_norm": 0.2265625, "learning_rate": 1.743961523190981e-05, "loss": 2.0749, "step": 9780 }, { "epoch": 1.66, "grad_norm": 0.23046875, "learning_rate": 1.7356302597679554e-05, "loss": 2.1447, "step": 9785 }, { "epoch": 1.66, "grad_norm": 0.2255859375, "learning_rate": 1.727317052615447e-05, "loss": 2.121, "step": 9790 }, { "epoch": 1.66, "grad_norm": 0.224609375, "learning_rate": 1.719021919896433e-05, "loss": 2.0826, "step": 9795 }, { "epoch": 1.66, "grad_norm": 0.228515625, "learning_rate": 1.7107448797343893e-05, "loss": 2.102, "step": 9800 }, { "epoch": 1.66, "grad_norm": 0.2314453125, "learning_rate": 1.7024859502132696e-05, "loss": 2.1022, "step": 9805 }, { "epoch": 1.66, "grad_norm": 0.22265625, "learning_rate": 1.6942451493774657e-05, "loss": 2.0963, "step": 9810 }, { "epoch": 1.66, "grad_norm": 0.2294921875, "learning_rate": 1.6860224952317473e-05, "loss": 2.1186, "step": 9815 }, { "epoch": 1.66, "grad_norm": 0.236328125, "learning_rate": 1.6778180057412486e-05, "loss": 2.1112, "step": 9820 }, { "epoch": 1.66, "grad_norm": 0.220703125, "learning_rate": 1.6696316988314043e-05, "loss": 2.1388, "step": 9825 }, { "epoch": 1.66, "grad_norm": 0.232421875, "learning_rate": 1.6614635923879362e-05, "loss": 2.1583, "step": 9830 }, { "epoch": 1.67, "grad_norm": 0.228515625, "learning_rate": 1.6533137042567936e-05, "loss": 2.1003, "step": 9835 }, { "epoch": 1.67, "grad_norm": 0.220703125, "learning_rate": 1.645182052244124e-05, "loss": 2.111, "step": 9840 }, { "epoch": 1.67, "grad_norm": 0.2333984375, "learning_rate": 1.6370686541162327e-05, "loss": 2.122, "step": 9845 }, { "epoch": 1.67, "grad_norm": 0.2265625, "learning_rate": 1.6289735275995433e-05, "loss": 2.0957, "step": 9850 }, { "epoch": 1.67, "grad_norm": 0.2294921875, "learning_rate": 1.6208966903805555e-05, "loss": 2.0987, "step": 9855 }, { "epoch": 1.67, "grad_norm": 0.228515625, "learning_rate": 1.6128381601058128e-05, "loss": 2.0697, "step": 9860 }, { "epoch": 1.67, "grad_norm": 0.2314453125, "learning_rate": 1.6047979543818624e-05, "loss": 2.1318, "step": 9865 }, { "epoch": 1.67, "grad_norm": 0.2236328125, "learning_rate": 1.5967760907752115e-05, "loss": 2.1134, "step": 9870 }, { "epoch": 1.67, "grad_norm": 0.2333984375, "learning_rate": 1.5887725868123006e-05, "loss": 2.1264, "step": 9875 }, { "epoch": 1.67, "grad_norm": 0.232421875, "learning_rate": 1.580787459979446e-05, "loss": 2.0945, "step": 9880 }, { "epoch": 1.67, "grad_norm": 0.2255859375, "learning_rate": 1.57282072772282e-05, "loss": 2.0919, "step": 9885 }, { "epoch": 1.67, "grad_norm": 0.2236328125, "learning_rate": 1.5648724074484056e-05, "loss": 2.1147, "step": 9890 }, { "epoch": 1.68, "grad_norm": 0.2294921875, "learning_rate": 1.5569425165219586e-05, "loss": 2.107, "step": 9895 }, { "epoch": 1.68, "grad_norm": 0.232421875, "learning_rate": 1.5490310722689693e-05, "loss": 2.0979, "step": 9900 }, { "epoch": 1.68, "grad_norm": 0.2353515625, "learning_rate": 1.5411380919746255e-05, "loss": 2.0866, "step": 9905 }, { "epoch": 1.68, "grad_norm": 0.228515625, "learning_rate": 1.5332635928837714e-05, "loss": 2.1099, "step": 9910 }, { "epoch": 1.68, "grad_norm": 0.2314453125, "learning_rate": 1.5254075922008748e-05, "loss": 2.1573, "step": 9915 }, { "epoch": 1.68, "grad_norm": 0.228515625, "learning_rate": 1.5175701070899896e-05, "loss": 2.134, "step": 9920 }, { "epoch": 1.68, "grad_norm": 0.23046875, "learning_rate": 1.5097511546747146e-05, "loss": 2.1199, "step": 9925 }, { "epoch": 1.68, "grad_norm": 0.224609375, "learning_rate": 1.501950752038158e-05, "loss": 2.1321, "step": 9930 }, { "epoch": 1.68, "grad_norm": 0.2294921875, "learning_rate": 1.4941689162228977e-05, "loss": 2.1165, "step": 9935 }, { "epoch": 1.68, "grad_norm": 0.228515625, "learning_rate": 1.4864056642309499e-05, "loss": 2.1185, "step": 9940 }, { "epoch": 1.68, "grad_norm": 0.224609375, "learning_rate": 1.4786610130237244e-05, "loss": 2.1314, "step": 9945 }, { "epoch": 1.69, "grad_norm": 0.228515625, "learning_rate": 1.4709349795219939e-05, "loss": 2.0686, "step": 9950 }, { "epoch": 1.69, "grad_norm": 0.2353515625, "learning_rate": 1.4632275806058559e-05, "loss": 2.1141, "step": 9955 }, { "epoch": 1.69, "grad_norm": 0.2333984375, "learning_rate": 1.4555388331146924e-05, "loss": 2.1641, "step": 9960 }, { "epoch": 1.69, "grad_norm": 0.2294921875, "learning_rate": 1.4478687538471313e-05, "loss": 2.0876, "step": 9965 }, { "epoch": 1.69, "grad_norm": 0.224609375, "learning_rate": 1.4402173595610213e-05, "loss": 2.132, "step": 9970 }, { "epoch": 1.69, "grad_norm": 0.2275390625, "learning_rate": 1.4325846669733844e-05, "loss": 2.0967, "step": 9975 }, { "epoch": 1.69, "grad_norm": 0.2333984375, "learning_rate": 1.4249706927603756e-05, "loss": 2.1232, "step": 9980 }, { "epoch": 1.69, "grad_norm": 0.2265625, "learning_rate": 1.4173754535572658e-05, "loss": 2.0908, "step": 9985 }, { "epoch": 1.69, "grad_norm": 0.2236328125, "learning_rate": 1.4097989659583876e-05, "loss": 2.1086, "step": 9990 }, { "epoch": 1.69, "grad_norm": 0.22265625, "learning_rate": 1.4022412465170987e-05, "loss": 2.117, "step": 9995 }, { "epoch": 1.69, "grad_norm": 0.23046875, "learning_rate": 1.3947023117457613e-05, "loss": 2.1503, "step": 10000 }, { "epoch": 1.69, "grad_norm": 0.236328125, "learning_rate": 1.3871821781156858e-05, "loss": 2.1238, "step": 10005 }, { "epoch": 1.7, "grad_norm": 0.2294921875, "learning_rate": 1.3796808620571121e-05, "loss": 2.124, "step": 10010 }, { "epoch": 1.7, "grad_norm": 0.228515625, "learning_rate": 1.3721983799591732e-05, "loss": 2.1265, "step": 10015 }, { "epoch": 1.7, "grad_norm": 0.240234375, "learning_rate": 1.3647347481698358e-05, "loss": 2.1128, "step": 10020 }, { "epoch": 1.7, "grad_norm": 0.22265625, "learning_rate": 1.3572899829958963e-05, "loss": 2.109, "step": 10025 }, { "epoch": 1.7, "grad_norm": 0.2197265625, "learning_rate": 1.3498641007029278e-05, "loss": 2.1203, "step": 10030 }, { "epoch": 1.7, "grad_norm": 0.232421875, "learning_rate": 1.342457117515239e-05, "loss": 2.1492, "step": 10035 }, { "epoch": 1.7, "grad_norm": 0.2216796875, "learning_rate": 1.3350690496158558e-05, "loss": 2.0852, "step": 10040 }, { "epoch": 1.7, "grad_norm": 0.2255859375, "learning_rate": 1.3276999131464818e-05, "loss": 2.1232, "step": 10045 }, { "epoch": 1.7, "grad_norm": 0.228515625, "learning_rate": 1.3203497242074437e-05, "loss": 2.1541, "step": 10050 }, { "epoch": 1.7, "grad_norm": 0.2314453125, "learning_rate": 1.3130184988576855e-05, "loss": 2.1114, "step": 10055 }, { "epoch": 1.7, "grad_norm": 0.2265625, "learning_rate": 1.3057062531147068e-05, "loss": 2.0998, "step": 10060 }, { "epoch": 1.7, "grad_norm": 0.2294921875, "learning_rate": 1.2984130029545494e-05, "loss": 2.1038, "step": 10065 }, { "epoch": 1.71, "grad_norm": 0.220703125, "learning_rate": 1.291138764311749e-05, "loss": 2.135, "step": 10070 }, { "epoch": 1.71, "grad_norm": 0.2236328125, "learning_rate": 1.2838835530793048e-05, "loss": 2.1491, "step": 10075 }, { "epoch": 1.71, "grad_norm": 0.2294921875, "learning_rate": 1.2766473851086435e-05, "loss": 2.1368, "step": 10080 }, { "epoch": 1.71, "grad_norm": 0.228515625, "learning_rate": 1.2694302762095889e-05, "loss": 2.0915, "step": 10085 }, { "epoch": 1.71, "grad_norm": 0.2255859375, "learning_rate": 1.2622322421503174e-05, "loss": 2.1016, "step": 10090 }, { "epoch": 1.71, "grad_norm": 0.2236328125, "learning_rate": 1.2550532986573349e-05, "loss": 2.1309, "step": 10095 }, { "epoch": 1.71, "grad_norm": 0.23046875, "learning_rate": 1.2478934614154359e-05, "loss": 2.1227, "step": 10100 }, { "epoch": 1.71, "grad_norm": 0.232421875, "learning_rate": 1.2407527460676727e-05, "loss": 2.1593, "step": 10105 }, { "epoch": 1.71, "grad_norm": 0.228515625, "learning_rate": 1.2336311682153201e-05, "loss": 2.1171, "step": 10110 }, { "epoch": 1.71, "grad_norm": 0.224609375, "learning_rate": 1.2265287434178352e-05, "loss": 2.0602, "step": 10115 }, { "epoch": 1.71, "grad_norm": 0.2236328125, "learning_rate": 1.2194454871928329e-05, "loss": 2.099, "step": 10120 }, { "epoch": 1.71, "grad_norm": 0.2314453125, "learning_rate": 1.2123814150160484e-05, "loss": 2.0976, "step": 10125 }, { "epoch": 1.72, "grad_norm": 0.2333984375, "learning_rate": 1.2053365423213026e-05, "loss": 2.1502, "step": 10130 }, { "epoch": 1.72, "grad_norm": 0.228515625, "learning_rate": 1.1983108845004675e-05, "loss": 2.1327, "step": 10135 }, { "epoch": 1.72, "grad_norm": 0.224609375, "learning_rate": 1.1913044569034382e-05, "loss": 2.1257, "step": 10140 }, { "epoch": 1.72, "grad_norm": 0.220703125, "learning_rate": 1.1843172748380848e-05, "loss": 2.1449, "step": 10145 }, { "epoch": 1.72, "grad_norm": 0.2314453125, "learning_rate": 1.1773493535702385e-05, "loss": 2.0872, "step": 10150 }, { "epoch": 1.72, "grad_norm": 0.2216796875, "learning_rate": 1.1704007083236457e-05, "loss": 2.1356, "step": 10155 }, { "epoch": 1.72, "grad_norm": 0.2236328125, "learning_rate": 1.1634713542799402e-05, "loss": 2.1342, "step": 10160 }, { "epoch": 1.72, "grad_norm": 0.23046875, "learning_rate": 1.1565613065786029e-05, "loss": 2.1246, "step": 10165 }, { "epoch": 1.72, "grad_norm": 0.2275390625, "learning_rate": 1.1496705803169405e-05, "loss": 2.1233, "step": 10170 }, { "epoch": 1.72, "grad_norm": 0.23046875, "learning_rate": 1.1427991905500369e-05, "loss": 2.1482, "step": 10175 }, { "epoch": 1.72, "grad_norm": 0.234375, "learning_rate": 1.1359471522907361e-05, "loss": 2.1573, "step": 10180 }, { "epoch": 1.72, "grad_norm": 0.228515625, "learning_rate": 1.1291144805095954e-05, "loss": 2.1015, "step": 10185 }, { "epoch": 1.73, "grad_norm": 0.2236328125, "learning_rate": 1.12230119013487e-05, "loss": 2.1213, "step": 10190 }, { "epoch": 1.73, "grad_norm": 0.2275390625, "learning_rate": 1.1155072960524626e-05, "loss": 2.1287, "step": 10195 }, { "epoch": 1.73, "grad_norm": 0.2275390625, "learning_rate": 1.1087328131058961e-05, "loss": 2.1512, "step": 10200 }, { "epoch": 1.73, "grad_norm": 0.2294921875, "learning_rate": 1.1019777560962885e-05, "loss": 2.1717, "step": 10205 }, { "epoch": 1.73, "grad_norm": 0.2236328125, "learning_rate": 1.0952421397823165e-05, "loss": 2.1036, "step": 10210 }, { "epoch": 1.73, "grad_norm": 0.228515625, "learning_rate": 1.0885259788801716e-05, "loss": 2.1408, "step": 10215 }, { "epoch": 1.73, "grad_norm": 0.224609375, "learning_rate": 1.0818292880635528e-05, "loss": 2.1403, "step": 10220 }, { "epoch": 1.73, "grad_norm": 0.228515625, "learning_rate": 1.0751520819636141e-05, "loss": 2.1093, "step": 10225 }, { "epoch": 1.73, "grad_norm": 0.232421875, "learning_rate": 1.0684943751689336e-05, "loss": 2.1154, "step": 10230 }, { "epoch": 1.73, "grad_norm": 0.234375, "learning_rate": 1.0618561822254935e-05, "loss": 2.1379, "step": 10235 }, { "epoch": 1.73, "grad_norm": 0.23046875, "learning_rate": 1.0552375176366369e-05, "loss": 2.1437, "step": 10240 }, { "epoch": 1.74, "grad_norm": 0.2451171875, "learning_rate": 1.048638395863043e-05, "loss": 2.0852, "step": 10245 }, { "epoch": 1.74, "grad_norm": 0.2236328125, "learning_rate": 1.0420588313226975e-05, "loss": 2.1063, "step": 10250 }, { "epoch": 1.74, "grad_norm": 0.23046875, "learning_rate": 1.0354988383908482e-05, "loss": 2.1128, "step": 10255 }, { "epoch": 1.74, "grad_norm": 0.232421875, "learning_rate": 1.0289584313999867e-05, "loss": 2.1065, "step": 10260 }, { "epoch": 1.74, "grad_norm": 0.228515625, "learning_rate": 1.0224376246398148e-05, "loss": 2.114, "step": 10265 }, { "epoch": 1.74, "grad_norm": 0.2265625, "learning_rate": 1.0159364323572052e-05, "loss": 2.1456, "step": 10270 }, { "epoch": 1.74, "grad_norm": 0.22265625, "learning_rate": 1.0094548687561777e-05, "loss": 2.0623, "step": 10275 }, { "epoch": 1.74, "grad_norm": 0.232421875, "learning_rate": 1.0029929479978773e-05, "loss": 2.08, "step": 10280 }, { "epoch": 1.74, "grad_norm": 0.224609375, "learning_rate": 9.965506842005145e-06, "loss": 2.1093, "step": 10285 }, { "epoch": 1.74, "grad_norm": 0.2314453125, "learning_rate": 9.901280914393696e-06, "loss": 2.0921, "step": 10290 }, { "epoch": 1.74, "grad_norm": 0.2275390625, "learning_rate": 9.83725183746731e-06, "loss": 2.1175, "step": 10295 }, { "epoch": 1.74, "grad_norm": 0.2265625, "learning_rate": 9.773419751118872e-06, "loss": 2.1462, "step": 10300 }, { "epoch": 1.75, "grad_norm": 0.23046875, "learning_rate": 9.70978479481085e-06, "loss": 2.1439, "step": 10305 }, { "epoch": 1.75, "grad_norm": 0.2236328125, "learning_rate": 9.646347107575037e-06, "loss": 2.1056, "step": 10310 }, { "epoch": 1.75, "grad_norm": 0.2333984375, "learning_rate": 9.58310682801219e-06, "loss": 2.1516, "step": 10315 }, { "epoch": 1.75, "grad_norm": 0.2216796875, "learning_rate": 9.520064094291791e-06, "loss": 2.1227, "step": 10320 }, { "epoch": 1.75, "grad_norm": 0.224609375, "learning_rate": 9.457219044151689e-06, "loss": 2.125, "step": 10325 }, { "epoch": 1.75, "grad_norm": 0.2265625, "learning_rate": 9.394571814897856e-06, "loss": 2.1679, "step": 10330 }, { "epoch": 1.75, "grad_norm": 0.2314453125, "learning_rate": 9.332122543404031e-06, "loss": 2.1152, "step": 10335 }, { "epoch": 1.75, "grad_norm": 0.224609375, "learning_rate": 9.269871366111494e-06, "loss": 2.1207, "step": 10340 }, { "epoch": 1.75, "grad_norm": 0.2265625, "learning_rate": 9.207818419028669e-06, "loss": 2.1568, "step": 10345 }, { "epoch": 1.75, "grad_norm": 0.2314453125, "learning_rate": 9.14596383773093e-06, "loss": 2.1264, "step": 10350 }, { "epoch": 1.75, "grad_norm": 0.2177734375, "learning_rate": 9.0843077573602e-06, "loss": 2.1534, "step": 10355 }, { "epoch": 1.75, "grad_norm": 0.228515625, "learning_rate": 9.02285031262473e-06, "loss": 2.1215, "step": 10360 }, { "epoch": 1.76, "grad_norm": 0.2294921875, "learning_rate": 8.961591637798827e-06, "loss": 2.1418, "step": 10365 }, { "epoch": 1.76, "grad_norm": 0.2216796875, "learning_rate": 8.900531866722472e-06, "loss": 2.1256, "step": 10370 }, { "epoch": 1.76, "grad_norm": 0.228515625, "learning_rate": 8.839671132801097e-06, "loss": 2.143, "step": 10375 }, { "epoch": 1.76, "grad_norm": 0.2255859375, "learning_rate": 8.779009569005236e-06, "loss": 2.1145, "step": 10380 }, { "epoch": 1.76, "grad_norm": 0.2265625, "learning_rate": 8.718547307870316e-06, "loss": 2.1316, "step": 10385 }, { "epoch": 1.76, "grad_norm": 0.228515625, "learning_rate": 8.658284481496303e-06, "loss": 2.165, "step": 10390 }, { "epoch": 1.76, "grad_norm": 0.2412109375, "learning_rate": 8.59822122154741e-06, "loss": 2.1197, "step": 10395 }, { "epoch": 1.76, "grad_norm": 0.228515625, "learning_rate": 8.538357659251872e-06, "loss": 2.1258, "step": 10400 }, { "epoch": 1.76, "grad_norm": 0.2236328125, "learning_rate": 8.478693925401604e-06, "loss": 2.1139, "step": 10405 }, { "epoch": 1.76, "grad_norm": 0.2275390625, "learning_rate": 8.419230150351886e-06, "loss": 2.1272, "step": 10410 }, { "epoch": 1.76, "grad_norm": 0.22265625, "learning_rate": 8.359966464021196e-06, "loss": 2.1235, "step": 10415 }, { "epoch": 1.76, "grad_norm": 0.234375, "learning_rate": 8.300902995890747e-06, "loss": 2.1193, "step": 10420 }, { "epoch": 1.77, "grad_norm": 0.2353515625, "learning_rate": 8.242039875004437e-06, "loss": 2.1293, "step": 10425 }, { "epoch": 1.77, "grad_norm": 0.2265625, "learning_rate": 8.18337722996837e-06, "loss": 2.1085, "step": 10430 }, { "epoch": 1.77, "grad_norm": 0.228515625, "learning_rate": 8.124915188950611e-06, "loss": 2.1161, "step": 10435 }, { "epoch": 1.77, "grad_norm": 0.228515625, "learning_rate": 8.066653879680997e-06, "loss": 2.0748, "step": 10440 }, { "epoch": 1.77, "grad_norm": 0.2333984375, "learning_rate": 8.008593429450806e-06, "loss": 2.1358, "step": 10445 }, { "epoch": 1.77, "grad_norm": 0.21875, "learning_rate": 7.950733965112378e-06, "loss": 2.1242, "step": 10450 }, { "epoch": 1.77, "grad_norm": 0.2294921875, "learning_rate": 7.893075613079048e-06, "loss": 2.1048, "step": 10455 }, { "epoch": 1.77, "grad_norm": 0.2236328125, "learning_rate": 7.835618499324726e-06, "loss": 2.0658, "step": 10460 }, { "epoch": 1.77, "grad_norm": 0.2275390625, "learning_rate": 7.778362749383571e-06, "loss": 2.1162, "step": 10465 }, { "epoch": 1.77, "grad_norm": 0.2275390625, "learning_rate": 7.72130848834991e-06, "loss": 2.148, "step": 10470 }, { "epoch": 1.77, "grad_norm": 0.2275390625, "learning_rate": 7.66445584087776e-06, "loss": 2.1371, "step": 10475 }, { "epoch": 1.77, "grad_norm": 0.224609375, "learning_rate": 7.607804931180662e-06, "loss": 2.0816, "step": 10480 }, { "epoch": 1.78, "grad_norm": 0.2294921875, "learning_rate": 7.5513558830314745e-06, "loss": 2.1102, "step": 10485 }, { "epoch": 1.78, "grad_norm": 0.228515625, "learning_rate": 7.495108819761898e-06, "loss": 2.1227, "step": 10490 }, { "epoch": 1.78, "grad_norm": 0.23046875, "learning_rate": 7.43906386426243e-06, "loss": 2.1205, "step": 10495 }, { "epoch": 1.78, "grad_norm": 0.2255859375, "learning_rate": 7.383221138981966e-06, "loss": 2.1385, "step": 10500 }, { "epoch": 1.78, "grad_norm": 0.228515625, "learning_rate": 7.3275807659275e-06, "loss": 2.0769, "step": 10505 }, { "epoch": 1.78, "grad_norm": 0.2236328125, "learning_rate": 7.272142866664023e-06, "loss": 2.1113, "step": 10510 }, { "epoch": 1.78, "grad_norm": 0.2294921875, "learning_rate": 7.216907562314079e-06, "loss": 2.1326, "step": 10515 }, { "epoch": 1.78, "grad_norm": 0.2197265625, "learning_rate": 7.161874973557625e-06, "loss": 2.1203, "step": 10520 }, { "epoch": 1.78, "grad_norm": 0.2255859375, "learning_rate": 7.107045220631692e-06, "loss": 2.1155, "step": 10525 }, { "epoch": 1.78, "grad_norm": 0.2255859375, "learning_rate": 7.05241842333012e-06, "loss": 2.1306, "step": 10530 }, { "epoch": 1.78, "grad_norm": 0.21875, "learning_rate": 6.9979947010033965e-06, "loss": 2.1211, "step": 10535 }, { "epoch": 1.79, "grad_norm": 0.2265625, "learning_rate": 6.943774172558259e-06, "loss": 2.1107, "step": 10540 }, { "epoch": 1.79, "grad_norm": 0.228515625, "learning_rate": 6.889756956457538e-06, "loss": 2.1414, "step": 10545 }, { "epoch": 1.79, "grad_norm": 0.2333984375, "learning_rate": 6.835943170719839e-06, "loss": 2.106, "step": 10550 }, { "epoch": 1.79, "grad_norm": 0.2265625, "learning_rate": 6.782332932919344e-06, "loss": 2.085, "step": 10555 }, { "epoch": 1.79, "grad_norm": 0.220703125, "learning_rate": 6.72892636018545e-06, "loss": 2.113, "step": 10560 }, { "epoch": 1.79, "grad_norm": 0.2294921875, "learning_rate": 6.6757235692026295e-06, "loss": 2.1459, "step": 10565 }, { "epoch": 1.79, "grad_norm": 0.2314453125, "learning_rate": 6.622724676210113e-06, "loss": 2.1171, "step": 10570 }, { "epoch": 1.79, "grad_norm": 0.228515625, "learning_rate": 6.569929797001651e-06, "loss": 2.0854, "step": 10575 }, { "epoch": 1.79, "grad_norm": 0.23046875, "learning_rate": 6.517339046925264e-06, "loss": 2.0713, "step": 10580 }, { "epoch": 1.79, "grad_norm": 0.2265625, "learning_rate": 6.4649525408829846e-06, "loss": 2.1328, "step": 10585 }, { "epoch": 1.79, "grad_norm": 0.228515625, "learning_rate": 6.412770393330558e-06, "loss": 2.0968, "step": 10590 }, { "epoch": 1.79, "grad_norm": 0.228515625, "learning_rate": 6.36079271827732e-06, "loss": 2.1114, "step": 10595 }, { "epoch": 1.8, "grad_norm": 0.2333984375, "learning_rate": 6.309019629285795e-06, "loss": 2.1412, "step": 10600 }, { "epoch": 1.8, "grad_norm": 0.240234375, "learning_rate": 6.257451239471579e-06, "loss": 2.1464, "step": 10605 }, { "epoch": 1.8, "grad_norm": 0.22265625, "learning_rate": 6.206087661503013e-06, "loss": 2.0909, "step": 10610 }, { "epoch": 1.8, "grad_norm": 0.23046875, "learning_rate": 6.154929007600929e-06, "loss": 2.1204, "step": 10615 }, { "epoch": 1.8, "grad_norm": 0.2265625, "learning_rate": 6.103975389538474e-06, "loss": 2.1196, "step": 10620 }, { "epoch": 1.8, "grad_norm": 0.2265625, "learning_rate": 6.053226918640809e-06, "loss": 2.1293, "step": 10625 }, { "epoch": 1.8, "grad_norm": 0.2236328125, "learning_rate": 6.002683705784884e-06, "loss": 2.1184, "step": 10630 }, { "epoch": 1.8, "grad_norm": 0.23046875, "learning_rate": 5.9523458613992e-06, "loss": 2.1225, "step": 10635 }, { "epoch": 1.8, "grad_norm": 0.2294921875, "learning_rate": 5.902213495463571e-06, "loss": 2.1736, "step": 10640 }, { "epoch": 1.8, "grad_norm": 0.2236328125, "learning_rate": 5.852286717508826e-06, "loss": 2.1048, "step": 10645 }, { "epoch": 1.8, "grad_norm": 0.234375, "learning_rate": 5.802565636616686e-06, "loss": 2.108, "step": 10650 }, { "epoch": 1.8, "grad_norm": 0.2236328125, "learning_rate": 5.753050361419388e-06, "loss": 2.1427, "step": 10655 }, { "epoch": 1.81, "grad_norm": 0.2333984375, "learning_rate": 5.703741000099594e-06, "loss": 2.1246, "step": 10660 }, { "epoch": 1.81, "grad_norm": 0.2216796875, "learning_rate": 5.65463766039005e-06, "loss": 2.1215, "step": 10665 }, { "epoch": 1.81, "grad_norm": 0.23046875, "learning_rate": 5.605740449573327e-06, "loss": 2.0739, "step": 10670 }, { "epoch": 1.81, "grad_norm": 0.23046875, "learning_rate": 5.557049474481702e-06, "loss": 2.1136, "step": 10675 }, { "epoch": 1.81, "grad_norm": 0.2314453125, "learning_rate": 5.508564841496855e-06, "loss": 2.0865, "step": 10680 }, { "epoch": 1.81, "grad_norm": 0.228515625, "learning_rate": 5.4602866565495845e-06, "loss": 2.1447, "step": 10685 }, { "epoch": 1.81, "grad_norm": 0.228515625, "learning_rate": 5.412215025119716e-06, "loss": 2.1571, "step": 10690 }, { "epoch": 1.81, "grad_norm": 0.21484375, "learning_rate": 5.364350052235767e-06, "loss": 2.1296, "step": 10695 }, { "epoch": 1.81, "grad_norm": 0.220703125, "learning_rate": 5.316691842474686e-06, "loss": 2.1316, "step": 10700 }, { "epoch": 1.81, "grad_norm": 0.23046875, "learning_rate": 5.269240499961747e-06, "loss": 2.1234, "step": 10705 }, { "epoch": 1.81, "grad_norm": 0.2265625, "learning_rate": 5.22199612837021e-06, "loss": 2.1094, "step": 10710 }, { "epoch": 1.81, "grad_norm": 0.2236328125, "learning_rate": 5.17495883092115e-06, "loss": 2.1003, "step": 10715 }, { "epoch": 1.82, "grad_norm": 0.224609375, "learning_rate": 5.1281287103832285e-06, "loss": 2.1195, "step": 10720 }, { "epoch": 1.82, "grad_norm": 0.2353515625, "learning_rate": 5.081505869072445e-06, "loss": 2.1281, "step": 10725 }, { "epoch": 1.82, "grad_norm": 0.228515625, "learning_rate": 5.035090408851961e-06, "loss": 2.1098, "step": 10730 }, { "epoch": 1.82, "grad_norm": 0.228515625, "learning_rate": 4.988882431131814e-06, "loss": 2.1547, "step": 10735 }, { "epoch": 1.82, "grad_norm": 0.2294921875, "learning_rate": 4.942882036868712e-06, "loss": 2.1152, "step": 10740 }, { "epoch": 1.82, "grad_norm": 0.2333984375, "learning_rate": 4.897089326565874e-06, "loss": 2.1086, "step": 10745 }, { "epoch": 1.82, "grad_norm": 0.234375, "learning_rate": 4.851504400272722e-06, "loss": 2.1177, "step": 10750 }, { "epoch": 1.82, "grad_norm": 0.2333984375, "learning_rate": 4.806127357584745e-06, "loss": 2.1149, "step": 10755 }, { "epoch": 1.82, "grad_norm": 0.2255859375, "learning_rate": 4.760958297643192e-06, "loss": 2.1224, "step": 10760 }, { "epoch": 1.82, "grad_norm": 0.2314453125, "learning_rate": 4.715997319134968e-06, "loss": 2.0825, "step": 10765 }, { "epoch": 1.82, "grad_norm": 0.2314453125, "learning_rate": 4.671244520292273e-06, "loss": 2.1383, "step": 10770 }, { "epoch": 1.82, "grad_norm": 0.232421875, "learning_rate": 4.626699998892548e-06, "loss": 2.1529, "step": 10775 }, { "epoch": 1.83, "grad_norm": 0.234375, "learning_rate": 4.58236385225812e-06, "loss": 2.1247, "step": 10780 }, { "epoch": 1.83, "grad_norm": 0.224609375, "learning_rate": 4.538236177256106e-06, "loss": 2.1216, "step": 10785 }, { "epoch": 1.83, "grad_norm": 0.236328125, "learning_rate": 4.4943170702981266e-06, "loss": 2.1224, "step": 10790 }, { "epoch": 1.83, "grad_norm": 0.236328125, "learning_rate": 4.45060662734007e-06, "loss": 2.1268, "step": 10795 }, { "epoch": 1.83, "grad_norm": 0.220703125, "learning_rate": 4.407104943882001e-06, "loss": 2.1131, "step": 10800 }, { "epoch": 1.83, "grad_norm": 0.23046875, "learning_rate": 4.363812114967847e-06, "loss": 2.1314, "step": 10805 }, { "epoch": 1.83, "grad_norm": 0.228515625, "learning_rate": 4.320728235185212e-06, "loss": 2.1682, "step": 10810 }, { "epoch": 1.83, "grad_norm": 0.2275390625, "learning_rate": 4.277853398665199e-06, "loss": 2.1185, "step": 10815 }, { "epoch": 1.83, "grad_norm": 0.23046875, "learning_rate": 4.2351876990821995e-06, "loss": 2.1275, "step": 10820 }, { "epoch": 1.83, "grad_norm": 0.2294921875, "learning_rate": 4.192731229653623e-06, "loss": 2.1367, "step": 10825 }, { "epoch": 1.83, "grad_norm": 0.2236328125, "learning_rate": 4.150484083139783e-06, "loss": 2.1177, "step": 10830 }, { "epoch": 1.84, "grad_norm": 0.2236328125, "learning_rate": 4.108446351843676e-06, "loss": 2.1122, "step": 10835 }, { "epoch": 1.84, "grad_norm": 0.228515625, "learning_rate": 4.066618127610722e-06, "loss": 2.1636, "step": 10840 }, { "epoch": 1.84, "grad_norm": 0.2255859375, "learning_rate": 4.0249995018286415e-06, "loss": 2.1378, "step": 10845 }, { "epoch": 1.84, "grad_norm": 0.2216796875, "learning_rate": 3.9835905654271535e-06, "loss": 2.1095, "step": 10850 }, { "epoch": 1.84, "grad_norm": 0.23046875, "learning_rate": 3.942391408877922e-06, "loss": 2.1403, "step": 10855 }, { "epoch": 1.84, "grad_norm": 0.2236328125, "learning_rate": 3.90140212219422e-06, "loss": 2.0605, "step": 10860 }, { "epoch": 1.84, "grad_norm": 0.2265625, "learning_rate": 3.860622794930801e-06, "loss": 2.0844, "step": 10865 }, { "epoch": 1.84, "grad_norm": 0.23046875, "learning_rate": 3.820053516183719e-06, "loss": 2.1389, "step": 10870 }, { "epoch": 1.84, "grad_norm": 0.2255859375, "learning_rate": 3.7796943745900924e-06, "loss": 2.132, "step": 10875 }, { "epoch": 1.84, "grad_norm": 0.2255859375, "learning_rate": 3.7395454583278868e-06, "loss": 2.1547, "step": 10880 }, { "epoch": 1.84, "grad_norm": 0.2353515625, "learning_rate": 3.6996068551158115e-06, "loss": 2.1167, "step": 10885 }, { "epoch": 1.84, "grad_norm": 0.232421875, "learning_rate": 3.659878652213056e-06, "loss": 2.0709, "step": 10890 }, { "epoch": 1.85, "grad_norm": 0.220703125, "learning_rate": 3.620360936419109e-06, "loss": 2.1322, "step": 10895 }, { "epoch": 1.85, "grad_norm": 0.2275390625, "learning_rate": 3.581053794073619e-06, "loss": 2.1527, "step": 10900 }, { "epoch": 1.85, "grad_norm": 0.2275390625, "learning_rate": 3.541957311056132e-06, "loss": 2.0757, "step": 10905 }, { "epoch": 1.85, "grad_norm": 0.224609375, "learning_rate": 3.503071572785932e-06, "loss": 2.1048, "step": 10910 }, { "epoch": 1.85, "grad_norm": 0.2314453125, "learning_rate": 3.4643966642219137e-06, "loss": 2.126, "step": 10915 }, { "epoch": 1.85, "grad_norm": 0.228515625, "learning_rate": 3.425932669862264e-06, "loss": 2.1336, "step": 10920 }, { "epoch": 1.85, "grad_norm": 0.23828125, "learning_rate": 3.387679673744404e-06, "loss": 2.1349, "step": 10925 }, { "epoch": 1.85, "grad_norm": 0.2197265625, "learning_rate": 3.3496377594447905e-06, "loss": 2.1169, "step": 10930 }, { "epoch": 1.85, "grad_norm": 0.2265625, "learning_rate": 3.311807010078627e-06, "loss": 2.1101, "step": 10935 }, { "epoch": 1.85, "grad_norm": 0.2314453125, "learning_rate": 3.2741875082998195e-06, "loss": 2.1645, "step": 10940 }, { "epoch": 1.85, "grad_norm": 0.2294921875, "learning_rate": 3.2367793363007213e-06, "loss": 2.1003, "step": 10945 }, { "epoch": 1.85, "grad_norm": 0.2314453125, "learning_rate": 3.19958257581191e-06, "loss": 2.1141, "step": 10950 }, { "epoch": 1.86, "grad_norm": 0.224609375, "learning_rate": 3.162597308102144e-06, "loss": 2.1581, "step": 10955 }, { "epoch": 1.86, "grad_norm": 0.2314453125, "learning_rate": 3.125823613978052e-06, "loss": 2.0951, "step": 10960 }, { "epoch": 1.86, "grad_norm": 0.232421875, "learning_rate": 3.0892615737840413e-06, "loss": 2.07, "step": 10965 }, { "epoch": 1.86, "grad_norm": 0.22265625, "learning_rate": 3.05291126740207e-06, "loss": 2.1168, "step": 10970 }, { "epoch": 1.86, "grad_norm": 0.22265625, "learning_rate": 3.0167727742514974e-06, "loss": 2.1106, "step": 10975 }, { "epoch": 1.86, "grad_norm": 0.232421875, "learning_rate": 2.980846173288898e-06, "loss": 2.1058, "step": 10980 }, { "epoch": 1.86, "grad_norm": 0.2314453125, "learning_rate": 2.9451315430079174e-06, "loss": 2.0987, "step": 10985 }, { "epoch": 1.86, "grad_norm": 0.2314453125, "learning_rate": 2.9096289614390815e-06, "loss": 2.0906, "step": 10990 }, { "epoch": 1.86, "grad_norm": 0.2392578125, "learning_rate": 2.8743385061495876e-06, "loss": 2.1334, "step": 10995 }, { "epoch": 1.86, "grad_norm": 0.2275390625, "learning_rate": 2.8392602542432366e-06, "loss": 2.1099, "step": 11000 }, { "epoch": 1.86, "grad_norm": 0.234375, "learning_rate": 2.8043942823601233e-06, "loss": 2.0759, "step": 11005 }, { "epoch": 1.86, "grad_norm": 0.2236328125, "learning_rate": 2.7697406666766123e-06, "loss": 2.1445, "step": 11010 }, { "epoch": 1.87, "grad_norm": 0.23046875, "learning_rate": 2.7352994829050627e-06, "loss": 2.1399, "step": 11015 }, { "epoch": 1.87, "grad_norm": 0.232421875, "learning_rate": 2.701070806293726e-06, "loss": 2.1307, "step": 11020 }, { "epoch": 1.87, "grad_norm": 0.2236328125, "learning_rate": 2.66705471162656e-06, "loss": 2.1047, "step": 11025 }, { "epoch": 1.87, "grad_norm": 0.2255859375, "learning_rate": 2.6332512732230585e-06, "loss": 2.102, "step": 11030 }, { "epoch": 1.87, "grad_norm": 0.2353515625, "learning_rate": 2.5996605649381e-06, "loss": 2.1327, "step": 11035 }, { "epoch": 1.87, "grad_norm": 0.2294921875, "learning_rate": 2.5662826601617783e-06, "loss": 2.1174, "step": 11040 }, { "epoch": 1.87, "grad_norm": 0.23046875, "learning_rate": 2.5331176318192706e-06, "loss": 2.1236, "step": 11045 }, { "epoch": 1.87, "grad_norm": 0.232421875, "learning_rate": 2.500165552370615e-06, "loss": 2.0935, "step": 11050 }, { "epoch": 1.87, "grad_norm": 0.224609375, "learning_rate": 2.467426493810643e-06, "loss": 2.1414, "step": 11055 }, { "epoch": 1.87, "grad_norm": 0.220703125, "learning_rate": 2.4349005276687042e-06, "loss": 2.1383, "step": 11060 }, { "epoch": 1.87, "grad_norm": 0.224609375, "learning_rate": 2.4025877250086316e-06, "loss": 2.1079, "step": 11065 }, { "epoch": 1.87, "grad_norm": 0.2255859375, "learning_rate": 2.3704881564285184e-06, "loss": 2.1241, "step": 11070 }, { "epoch": 1.88, "grad_norm": 0.23046875, "learning_rate": 2.338601892060566e-06, "loss": 2.0867, "step": 11075 }, { "epoch": 1.88, "grad_norm": 0.2265625, "learning_rate": 2.3069290015709565e-06, "loss": 2.1409, "step": 11080 }, { "epoch": 1.88, "grad_norm": 0.2265625, "learning_rate": 2.2754695541596593e-06, "loss": 2.1097, "step": 11085 }, { "epoch": 1.88, "grad_norm": 0.2333984375, "learning_rate": 2.2442236185603262e-06, "loss": 2.0971, "step": 11090 }, { "epoch": 1.88, "grad_norm": 0.220703125, "learning_rate": 2.2131912630401485e-06, "loss": 2.1069, "step": 11095 }, { "epoch": 1.88, "grad_norm": 0.2275390625, "learning_rate": 2.182372555399603e-06, "loss": 2.1526, "step": 11100 }, { "epoch": 1.88, "grad_norm": 0.228515625, "learning_rate": 2.151767562972462e-06, "loss": 2.1291, "step": 11105 }, { "epoch": 1.88, "grad_norm": 0.234375, "learning_rate": 2.121376352625537e-06, "loss": 2.0771, "step": 11110 }, { "epoch": 1.88, "grad_norm": 0.2255859375, "learning_rate": 2.091198990758547e-06, "loss": 2.1103, "step": 11115 }, { "epoch": 1.88, "grad_norm": 0.232421875, "learning_rate": 2.0612355433039965e-06, "loss": 2.1198, "step": 11120 }, { "epoch": 1.88, "grad_norm": 0.2255859375, "learning_rate": 2.0314860757270295e-06, "loss": 2.1741, "step": 11125 }, { "epoch": 1.89, "grad_norm": 0.2275390625, "learning_rate": 2.001950653025253e-06, "loss": 2.1404, "step": 11130 }, { "epoch": 1.89, "grad_norm": 0.2236328125, "learning_rate": 1.9726293397286823e-06, "loss": 2.1171, "step": 11135 }, { "epoch": 1.89, "grad_norm": 0.2265625, "learning_rate": 1.943522199899472e-06, "loss": 2.12, "step": 11140 }, { "epoch": 1.89, "grad_norm": 0.2255859375, "learning_rate": 1.914629297131876e-06, "loss": 2.1035, "step": 11145 }, { "epoch": 1.89, "grad_norm": 0.23046875, "learning_rate": 1.8859506945520856e-06, "loss": 2.1324, "step": 11150 }, { "epoch": 1.89, "grad_norm": 0.2333984375, "learning_rate": 1.857486454818047e-06, "loss": 2.0816, "step": 11155 }, { "epoch": 1.89, "grad_norm": 0.2255859375, "learning_rate": 1.8292366401193805e-06, "loss": 2.1412, "step": 11160 }, { "epoch": 1.89, "grad_norm": 0.224609375, "learning_rate": 1.8012013121772475e-06, "loss": 2.1293, "step": 11165 }, { "epoch": 1.89, "grad_norm": 0.2216796875, "learning_rate": 1.7733805322441398e-06, "loss": 2.0747, "step": 11170 }, { "epoch": 1.89, "grad_norm": 0.2265625, "learning_rate": 1.7457743611038468e-06, "loss": 2.126, "step": 11175 }, { "epoch": 1.89, "grad_norm": 0.22265625, "learning_rate": 1.7183828590712436e-06, "loss": 2.102, "step": 11180 }, { "epoch": 1.89, "grad_norm": 0.2255859375, "learning_rate": 1.691206085992192e-06, "loss": 2.1216, "step": 11185 }, { "epoch": 1.9, "grad_norm": 0.2255859375, "learning_rate": 1.6642441012434172e-06, "loss": 2.1466, "step": 11190 }, { "epoch": 1.9, "grad_norm": 0.22265625, "learning_rate": 1.6374969637323545e-06, "loss": 2.1029, "step": 11195 }, { "epoch": 1.9, "grad_norm": 0.2255859375, "learning_rate": 1.6109647318970466e-06, "loss": 2.1073, "step": 11200 }, { "epoch": 1.9, "grad_norm": 0.2197265625, "learning_rate": 1.5846474637060015e-06, "loss": 2.0883, "step": 11205 }, { "epoch": 1.9, "grad_norm": 0.228515625, "learning_rate": 1.5585452166580583e-06, "loss": 2.1062, "step": 11210 }, { "epoch": 1.9, "grad_norm": 0.2236328125, "learning_rate": 1.5326580477822761e-06, "loss": 2.167, "step": 11215 }, { "epoch": 1.9, "grad_norm": 0.2255859375, "learning_rate": 1.5069860136378121e-06, "loss": 2.1129, "step": 11220 }, { "epoch": 1.9, "grad_norm": 0.2353515625, "learning_rate": 1.481529170313778e-06, "loss": 2.075, "step": 11225 }, { "epoch": 1.9, "grad_norm": 0.2314453125, "learning_rate": 1.456287573429138e-06, "loss": 2.1242, "step": 11230 }, { "epoch": 1.9, "grad_norm": 0.224609375, "learning_rate": 1.4312612781325785e-06, "loss": 2.1539, "step": 11235 }, { "epoch": 1.9, "grad_norm": 0.2255859375, "learning_rate": 1.406450339102361e-06, "loss": 2.0581, "step": 11240 }, { "epoch": 1.9, "grad_norm": 0.2236328125, "learning_rate": 1.381854810546268e-06, "loss": 2.1453, "step": 11245 }, { "epoch": 1.91, "grad_norm": 0.228515625, "learning_rate": 1.357474746201426e-06, "loss": 2.1207, "step": 11250 }, { "epoch": 1.91, "grad_norm": 0.2490234375, "learning_rate": 1.3333101993342145e-06, "loss": 2.1136, "step": 11255 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 1.3093612227401576e-06, "loss": 2.0805, "step": 11260 }, { "epoch": 1.91, "grad_norm": 0.2216796875, "learning_rate": 1.285627868743744e-06, "loss": 2.1168, "step": 11265 }, { "epoch": 1.91, "grad_norm": 0.228515625, "learning_rate": 1.2621101891984289e-06, "loss": 2.0865, "step": 11270 }, { "epoch": 1.91, "grad_norm": 0.232421875, "learning_rate": 1.2388082354863994e-06, "loss": 2.1729, "step": 11275 }, { "epoch": 1.91, "grad_norm": 0.232421875, "learning_rate": 1.2157220585185536e-06, "loss": 2.0999, "step": 11280 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 1.1928517087343327e-06, "loss": 2.1423, "step": 11285 }, { "epoch": 1.91, "grad_norm": 0.2294921875, "learning_rate": 1.1701972361016443e-06, "loss": 2.1503, "step": 11290 }, { "epoch": 1.91, "grad_norm": 0.2265625, "learning_rate": 1.1477586901167403e-06, "loss": 2.1066, "step": 11295 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 1.1255361198040938e-06, "loss": 2.1753, "step": 11300 }, { "epoch": 1.91, "grad_norm": 0.228515625, "learning_rate": 1.1035295737163221e-06, "loss": 2.1592, "step": 11305 }, { "epoch": 1.92, "grad_norm": 0.2265625, "learning_rate": 1.0817390999340537e-06, "loss": 2.1417, "step": 11310 }, { "epoch": 1.92, "grad_norm": 0.2294921875, "learning_rate": 1.0601647460658615e-06, "loss": 2.1685, "step": 11315 }, { "epoch": 1.92, "grad_norm": 0.220703125, "learning_rate": 1.0388065592480956e-06, "loss": 2.0922, "step": 11320 }, { "epoch": 1.92, "grad_norm": 0.2275390625, "learning_rate": 1.0176645861448285e-06, "loss": 2.1161, "step": 11325 }, { "epoch": 1.92, "grad_norm": 0.23046875, "learning_rate": 9.967388729477779e-07, "loss": 2.1453, "step": 11330 }, { "epoch": 1.92, "grad_norm": 0.2216796875, "learning_rate": 9.760294653761048e-07, "loss": 2.1358, "step": 11335 }, { "epoch": 1.92, "grad_norm": 0.2294921875, "learning_rate": 9.555364086764273e-07, "loss": 2.0958, "step": 11340 }, { "epoch": 1.92, "grad_norm": 0.22265625, "learning_rate": 9.352597476226743e-07, "loss": 2.1375, "step": 11345 }, { "epoch": 1.92, "grad_norm": 0.22265625, "learning_rate": 9.15199526515953e-07, "loss": 2.1524, "step": 11350 }, { "epoch": 1.92, "grad_norm": 0.2275390625, "learning_rate": 8.953557891844933e-07, "loss": 2.0964, "step": 11355 }, { "epoch": 1.92, "grad_norm": 0.2392578125, "learning_rate": 8.757285789835923e-07, "loss": 2.1318, "step": 11360 }, { "epoch": 1.92, "grad_norm": 0.220703125, "learning_rate": 8.563179387953812e-07, "loss": 2.1265, "step": 11365 }, { "epoch": 1.93, "grad_norm": 0.236328125, "learning_rate": 8.371239110289252e-07, "loss": 2.1221, "step": 11370 }, { "epoch": 1.93, "grad_norm": 0.224609375, "learning_rate": 8.181465376199348e-07, "loss": 2.1379, "step": 11375 }, { "epoch": 1.93, "grad_norm": 0.251953125, "learning_rate": 7.993858600308324e-07, "loss": 2.1531, "step": 11380 }, { "epoch": 1.93, "grad_norm": 0.228515625, "learning_rate": 7.808419192505745e-07, "loss": 2.1383, "step": 11385 }, { "epoch": 1.93, "grad_norm": 0.2265625, "learning_rate": 7.625147557945633e-07, "loss": 2.174, "step": 11390 }, { "epoch": 1.93, "grad_norm": 0.2197265625, "learning_rate": 7.44404409704591e-07, "loss": 2.1006, "step": 11395 }, { "epoch": 1.93, "grad_norm": 0.22265625, "learning_rate": 7.26510920548773e-07, "loss": 2.1225, "step": 11400 }, { "epoch": 1.93, "grad_norm": 0.22265625, "learning_rate": 7.088343274213926e-07, "loss": 2.0748, "step": 11405 }, { "epoch": 1.93, "grad_norm": 0.228515625, "learning_rate": 6.913746689428458e-07, "loss": 2.1151, "step": 11410 }, { "epoch": 1.93, "grad_norm": 0.2314453125, "learning_rate": 6.741319832595849e-07, "loss": 2.1177, "step": 11415 }, { "epoch": 1.93, "grad_norm": 0.2275390625, "learning_rate": 6.571063080440087e-07, "loss": 2.1477, "step": 11420 }, { "epoch": 1.93, "grad_norm": 0.244140625, "learning_rate": 6.402976804943728e-07, "loss": 2.1342, "step": 11425 }, { "epoch": 1.94, "grad_norm": 0.2236328125, "learning_rate": 6.23706137334723e-07, "loss": 2.1416, "step": 11430 }, { "epoch": 1.94, "grad_norm": 0.2197265625, "learning_rate": 6.073317148148294e-07, "loss": 2.0855, "step": 11435 }, { "epoch": 1.94, "grad_norm": 0.224609375, "learning_rate": 5.911744487100745e-07, "loss": 2.1301, "step": 11440 }, { "epoch": 1.94, "grad_norm": 0.22265625, "learning_rate": 5.752343743213873e-07, "loss": 2.1179, "step": 11445 }, { "epoch": 1.94, "grad_norm": 0.23046875, "learning_rate": 5.595115264751649e-07, "loss": 2.0996, "step": 11450 }, { "epoch": 1.94, "grad_norm": 0.2314453125, "learning_rate": 5.440059395232178e-07, "loss": 2.128, "step": 11455 }, { "epoch": 1.94, "grad_norm": 0.224609375, "learning_rate": 5.287176473426692e-07, "loss": 2.1684, "step": 11460 }, { "epoch": 1.94, "grad_norm": 0.2314453125, "learning_rate": 5.136466833358999e-07, "loss": 2.1402, "step": 11465 }, { "epoch": 1.94, "grad_norm": 0.2236328125, "learning_rate": 4.987930804304375e-07, "loss": 2.0991, "step": 11470 }, { "epoch": 1.94, "grad_norm": 0.2236328125, "learning_rate": 4.841568710789335e-07, "loss": 2.0907, "step": 11475 }, { "epoch": 1.94, "grad_norm": 0.23046875, "learning_rate": 4.697380872590751e-07, "loss": 2.1306, "step": 11480 }, { "epoch": 1.95, "grad_norm": 0.228515625, "learning_rate": 4.55536760473485e-07, "loss": 2.1029, "step": 11485 }, { "epoch": 1.95, "grad_norm": 0.232421875, "learning_rate": 4.4155292174971054e-07, "loss": 2.1302, "step": 11490 }, { "epoch": 1.95, "grad_norm": 0.2275390625, "learning_rate": 4.2778660164011217e-07, "loss": 2.1341, "step": 11495 }, { "epoch": 1.95, "grad_norm": 0.2373046875, "learning_rate": 4.142378302217864e-07, "loss": 2.1308, "step": 11500 }, { "epoch": 1.95, "grad_norm": 0.23046875, "learning_rate": 4.0090663709655417e-07, "loss": 2.1504, "step": 11505 }, { "epoch": 1.95, "grad_norm": 0.2275390625, "learning_rate": 3.877930513908501e-07, "loss": 2.1156, "step": 11510 }, { "epoch": 1.95, "grad_norm": 0.228515625, "learning_rate": 3.7489710175566686e-07, "loss": 2.1101, "step": 11515 }, { "epoch": 1.95, "grad_norm": 0.2314453125, "learning_rate": 3.622188163664997e-07, "loss": 2.1366, "step": 11520 }, { "epoch": 1.95, "grad_norm": 0.2373046875, "learning_rate": 3.4975822292331317e-07, "loss": 2.1219, "step": 11525 }, { "epoch": 1.95, "grad_norm": 0.2265625, "learning_rate": 3.375153486504079e-07, "loss": 2.0802, "step": 11530 }, { "epoch": 1.95, "grad_norm": 0.2265625, "learning_rate": 3.254902202964205e-07, "loss": 2.117, "step": 11535 }, { "epoch": 1.95, "grad_norm": 0.2197265625, "learning_rate": 3.1368286413426817e-07, "loss": 2.0799, "step": 11540 }, { "epoch": 1.96, "grad_norm": 0.2314453125, "learning_rate": 3.0209330596104866e-07, "loss": 2.1347, "step": 11545 }, { "epoch": 1.96, "grad_norm": 0.236328125, "learning_rate": 2.9072157109800714e-07, "loss": 2.1671, "step": 11550 }, { "epoch": 1.96, "grad_norm": 0.2353515625, "learning_rate": 2.7956768439050265e-07, "loss": 2.1185, "step": 11555 }, { "epoch": 1.96, "grad_norm": 0.224609375, "learning_rate": 2.686316702079084e-07, "loss": 2.1238, "step": 11560 }, { "epoch": 1.96, "grad_norm": 0.2294921875, "learning_rate": 2.579135524436005e-07, "loss": 2.1151, "step": 11565 }, { "epoch": 1.96, "grad_norm": 0.2255859375, "learning_rate": 2.4741335451488047e-07, "loss": 2.0894, "step": 11570 }, { "epoch": 1.96, "grad_norm": 0.23046875, "learning_rate": 2.3713109936291944e-07, "loss": 2.1385, "step": 11575 }, { "epoch": 1.96, "grad_norm": 0.2236328125, "learning_rate": 2.2706680945273617e-07, "loss": 2.1139, "step": 11580 }, { "epoch": 1.96, "grad_norm": 0.224609375, "learning_rate": 2.1722050677313032e-07, "loss": 2.1246, "step": 11585 }, { "epoch": 1.96, "grad_norm": 0.2265625, "learning_rate": 2.075922128366381e-07, "loss": 2.1213, "step": 11590 }, { "epoch": 1.96, "grad_norm": 0.2265625, "learning_rate": 1.981819486794656e-07, "loss": 2.1515, "step": 11595 }, { "epoch": 1.96, "grad_norm": 0.2294921875, "learning_rate": 1.8898973486146664e-07, "loss": 2.1291, "step": 11600 }, { "epoch": 1.97, "grad_norm": 0.224609375, "learning_rate": 1.8001559146612058e-07, "loss": 2.112, "step": 11605 }, { "epoch": 1.97, "grad_norm": 0.2373046875, "learning_rate": 1.7125953810041007e-07, "loss": 2.1438, "step": 11610 }, { "epoch": 1.97, "grad_norm": 0.232421875, "learning_rate": 1.6272159389486564e-07, "loss": 2.1497, "step": 11615 }, { "epoch": 1.97, "grad_norm": 0.228515625, "learning_rate": 1.5440177750346563e-07, "loss": 2.1304, "step": 11620 }, { "epoch": 1.97, "grad_norm": 0.2265625, "learning_rate": 1.4630010710363628e-07, "loss": 2.1374, "step": 11625 }, { "epoch": 1.97, "grad_norm": 0.22265625, "learning_rate": 1.384166003961518e-07, "loss": 2.137, "step": 11630 }, { "epoch": 1.97, "grad_norm": 0.2255859375, "learning_rate": 1.3075127460518976e-07, "loss": 2.1289, "step": 11635 }, { "epoch": 1.97, "grad_norm": 0.2275390625, "learning_rate": 1.23304146478187e-07, "loss": 2.147, "step": 11640 }, { "epoch": 1.97, "grad_norm": 0.2265625, "learning_rate": 1.1607523228588379e-07, "loss": 2.1013, "step": 11645 }, { "epoch": 1.97, "grad_norm": 0.2216796875, "learning_rate": 1.090645478222574e-07, "loss": 2.1047, "step": 11650 }, { "epoch": 1.97, "grad_norm": 0.224609375, "learning_rate": 1.0227210840448864e-07, "loss": 2.116, "step": 11655 }, { "epoch": 1.97, "grad_norm": 0.2158203125, "learning_rate": 9.569792887290651e-08, "loss": 2.0968, "step": 11660 }, { "epoch": 1.98, "grad_norm": 0.23046875, "learning_rate": 8.934202359102139e-08, "loss": 2.1216, "step": 11665 }, { "epoch": 1.98, "grad_norm": 0.2275390625, "learning_rate": 8.320440644541405e-08, "loss": 2.1317, "step": 11670 }, { "epoch": 1.98, "grad_norm": 0.2265625, "learning_rate": 7.728509084574676e-08, "loss": 2.1112, "step": 11675 }, { "epoch": 1.98, "grad_norm": 0.2294921875, "learning_rate": 7.158408972476327e-08, "loss": 2.1439, "step": 11680 }, { "epoch": 1.98, "grad_norm": 0.2255859375, "learning_rate": 6.610141553816674e-08, "loss": 2.084, "step": 11685 }, { "epoch": 1.98, "grad_norm": 0.2333984375, "learning_rate": 6.083708026471957e-08, "loss": 2.1251, "step": 11690 }, { "epoch": 1.98, "grad_norm": 0.22265625, "learning_rate": 5.579109540609917e-08, "loss": 2.1635, "step": 11695 }, { "epoch": 1.98, "grad_norm": 0.2275390625, "learning_rate": 5.096347198694229e-08, "loss": 2.0743, "step": 11700 }, { "epoch": 1.98, "grad_norm": 0.2373046875, "learning_rate": 4.6354220554800655e-08, "loss": 2.1271, "step": 11705 }, { "epoch": 1.98, "grad_norm": 0.22265625, "learning_rate": 4.196335118012984e-08, "loss": 2.0982, "step": 11710 }, { "epoch": 1.98, "grad_norm": 0.228515625, "learning_rate": 3.779087345624488e-08, "loss": 2.1128, "step": 11715 }, { "epoch": 1.98, "grad_norm": 0.23046875, "learning_rate": 3.383679649929805e-08, "loss": 2.1136, "step": 11720 }, { "epoch": 1.99, "grad_norm": 0.228515625, "learning_rate": 3.010112894831219e-08, "loss": 2.1439, "step": 11725 }, { "epoch": 1.99, "grad_norm": 0.2255859375, "learning_rate": 2.6583878965080745e-08, "loss": 2.1431, "step": 11730 }, { "epoch": 1.99, "grad_norm": 0.224609375, "learning_rate": 2.3285054234223334e-08, "loss": 2.1114, "step": 11735 }, { "epoch": 1.99, "grad_norm": 0.2275390625, "learning_rate": 2.0204661963107996e-08, "loss": 2.1307, "step": 11740 }, { "epoch": 1.99, "grad_norm": 0.2275390625, "learning_rate": 1.7342708881884496e-08, "loss": 2.1688, "step": 11745 }, { "epoch": 1.99, "grad_norm": 0.2255859375, "learning_rate": 1.469920124343993e-08, "loss": 2.1105, "step": 11750 }, { "epoch": 1.99, "grad_norm": 0.2314453125, "learning_rate": 1.2274144823409828e-08, "loss": 2.1762, "step": 11755 }, { "epoch": 1.99, "grad_norm": 0.23828125, "learning_rate": 1.006754492012263e-08, "loss": 2.174, "step": 11760 }, { "epoch": 1.99, "grad_norm": 0.2275390625, "learning_rate": 8.079406354644103e-09, "loss": 2.1249, "step": 11765 }, { "epoch": 1.99, "grad_norm": 0.2294921875, "learning_rate": 6.309733470721835e-09, "loss": 2.096, "step": 11770 }, { "epoch": 1.99, "grad_norm": 0.22265625, "learning_rate": 4.758530134785222e-09, "loss": 2.089, "step": 11775 }, { "epoch": 2.0, "grad_norm": 0.2265625, "learning_rate": 3.425799735978785e-09, "loss": 2.1153, "step": 11780 }, { "epoch": 2.0, "grad_norm": 0.2265625, "learning_rate": 2.3115451860733495e-09, "loss": 2.1192, "step": 11785 }, { "epoch": 2.0, "grad_norm": 0.236328125, "learning_rate": 1.4157689195326563e-09, "loss": 2.1282, "step": 11790 }, { "epoch": 2.0, "grad_norm": 0.2353515625, "learning_rate": 7.3847289349116e-10, "loss": 2.0832, "step": 11795 }, { "epoch": 2.0, "grad_norm": 0.2265625, "learning_rate": 2.796585877207214e-10, "loss": 2.1107, "step": 11800 }, { "epoch": 2.0, "grad_norm": 0.2353515625, "learning_rate": 3.932700465281158e-11, "loss": 2.1311, "step": 11805 }, { "epoch": 2.0, "eval_loss": 2.1430556774139404, "eval_runtime": 161.578, "eval_samples_per_second": 16.444, "eval_steps_per_second": 2.061, "step": 11808 }, { "epoch": 2.0, "step": 11808, "total_flos": 6.077393230092042e+17, "train_loss": 2.1527459967507903, "train_runtime": 22011.903, "train_samples_per_second": 4.292, "train_steps_per_second": 0.536 } ], "logging_steps": 5, "max_steps": 11808, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 6.077393230092042e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }